diff --git a/.travis.yml b/.travis.yml index 1f2940404eed0..5a16c1a6c25e7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ env: git: # for cloning - depth: 300 + depth: 500 matrix: fast_finish: true diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 9367c42f8d39a..5c5a1df4ea1f8 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -423,7 +423,7 @@ class frame_get_dtype_counts(object): goal_time = 0.2 def setup(self): - self.df = pandas.DataFrame(np.random.randn(10, 10000)) + self.df = DataFrame(np.random.randn(10, 10000)) def time_frame_get_dtype_counts(self): self.df.get_dtype_counts() @@ -985,3 +985,14 @@ def setup(self): def time_series_string_vector_slice(self): self.s.str[:5] + + +class frame_quantile_axis1(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 3), + columns=list('ABC')) + + def time_frame_quantile_axis1(self): + self.df.quantile([0.1, 0.5], axis=1) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 7279d73eb0d97..586bd00b091fe 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -773,6 +773,21 @@ def setup(self): def time_groupby_transform_series2(self): self.df.groupby('id')['val'].transform(np.mean) + +class groupby_transform_dataframe(object): + # GH 12737 + goal_time = 0.2 + + def setup(self): + self.df = pd.DataFrame({'group': np.repeat(np.arange(1000), 10), + 'B': np.nan, + 'C': np.nan}) + self.df.ix[4::10, 'B':'C'] = 5 + + def time_groupby_transform_dataframe(self): + self.df.groupby('group').transform('first') + + class groupby_transform_cythonized(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 18cd4de6cc9c5..04f25034638cd 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -23,18 +23,42 @@ class read_csv_default_converter(object): goal_time = 0.2 def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" self.data = (self.data * 200) def time_read_csv_default_converter(self): read_csv(StringIO(self.data), sep=',', header=None, float_precision=None) +class read_csv_default_converter_with_decimal(object): + goal_time = 0.2 + + def setup(self): + self.data = """0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n +0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 
+0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n +0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n +0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n""" + self.data = (self.data * 200) + + def time_read_csv_default_converter_with_decimal(self): + read_csv(StringIO(self.data), sep=';', header=None, + float_precision=None, decimal=',') + + class read_csv_precise_converter(object): goal_time = 0.2 def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" self.data = (self.data * 200) def time_read_csv_precise_converter(self): @@ -45,7 +69,11 @@ class read_csv_roundtrip_converter(object): goal_time = 0.2 def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" self.data = (self.data * 200) def time_read_csv_roundtrip_converter(self): @@ -109,4 +137,28 @@ def setup(self): self.data = (self.data * 200) def time_read_table_multiple_date_baseline(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) \ No newline at end of file + read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) + + +class read_csv_default_converter_python_engine(object): + goal_time = 0.2 + + def setup(self): + self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 
0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision=None, engine='python') + + +class read_csv_default_converter_with_decimal_python_engine(object): + goal_time = 0.2 + + def setup(self): + self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter_with_decimal(self): + read_csv(StringIO(self.data), sep=';', header=None, + float_precision=None, decimal=',', engine='python') diff --git a/ci/cron/go_doc.sh b/ci/cron/go_doc.sh deleted file mode 100755 index 89659577d0e7f..0000000000000 --- a/ci/cron/go_doc.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# This is a one-command cron job for setting up -# a virtualenv-based, linux-based, py2-based environment -# for building the Pandas documentation. -# -# The first run will install all required deps from pypi -# into the venv including monsters like scipy. -# You may want to set it up yourself to speed up the -# process. -# -# This is meant to be run as a cron job under a dedicated -# user account whose HOME directory contains this script. -# a CI directory will be created under it and all files -# stored within it. -# -# The hardcoded dep versions will gradually become obsolete -# You may need to tweak them -# -# @y-p, Jan/2014 - -# disto latex is sometimes finicky. Optionall use -# a local texlive install -export PATH=/mnt/debian/texlive/2013/bin/x86_64-linux:$PATH - -# Having ccache will speed things up -export PATH=/usr/lib64/ccache/:$PATH - -# limit disk usage -ccache -M 200M - -BASEDIR="$HOME/CI" -REPO_URL="https://github.com/pydata/pandas" -REPO_LOC="$BASEDIR/pandas" - -if [ ! -d $BASEDIR ]; then - mkdir -p $BASEDIR - virtualenv $BASEDIR/venv -fi - -source $BASEDIR/venv/bin/activate - -pip install numpy==1.7.2 -pip install cython==0.20.0 -pip install python-dateutil==2.2 -pip install --pre pytz==2013.9 -pip install sphinx==1.1.3 -pip install numexpr==2.2.2 - -pip install matplotlib==1.3.0 -pip install lxml==3.2.5 -pip install beautifulsoup4==4.3.2 -pip install html5lib==0.99 - -# You'll need R as well -pip install rpy2==2.3.9 - -pip install tables==3.0.0 -pip install bottleneck==0.7.0 -pip install ipython==0.13.2 - -# only if you have too -pip install scipy==0.13.2 - -pip install openpyxl==1.6.2 -pip install xlrd==0.9.2 -pip install xlwt==0.7.5 -pip install xlsxwriter==0.5.1 -pip install sqlalchemy==0.8.3 - -if [ ! -d "$REPO_LOC" ]; then - git clone "$REPO_URL" "$REPO_LOC" -fi - -cd "$REPO_LOC" -git reset --hard -git clean -df -git checkout master -git pull origin -make - -source $BASEDIR/venv/bin/activate -export PATH="/usr/lib64/ccache/:$PATH" -pip uninstall pandas -yq -pip install "$REPO_LOC" - -cd "$REPO_LOC"/doc - -python make.py clean -python make.py html -if [ ! $? 
== 0 ]; then - exit 1 -fi -python make.py zip_html -# usually requires manual intervention -# python make.py latex - -# If you have access: -# python make.py upload_dev diff --git a/ci/lint.sh b/ci/lint.sh index 6b8f160fc90db..a4c960084040f 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -15,7 +15,17 @@ if [ "$LINT" ]; then if [ $? -ne "0" ]; then RET=1 fi + done + echo "Linting DONE" + + echo "Check for invalid testing" + grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for invalid testing DONE" + else echo "NOT Linting" fi diff --git a/ci/requirements-3.4.run b/ci/requirements-3.4.run index 7d4cdcd21595a..3e12adae7dd9f 100644 --- a/ci/requirements-3.4.run +++ b/ci/requirements-3.4.run @@ -1,4 +1,4 @@ -pytz +pytz=2015.7 numpy=1.8.1 openpyxl xlsxwriter diff --git a/codecov.yml b/codecov.yml index edf2d821e07e5..45a6040c6a50d 100644 --- a/codecov.yml +++ b/codecov.yml @@ -7,6 +7,3 @@ coverage: default: target: '50' branches: null - changes: - default: - branches: null diff --git a/doc/README.rst b/doc/README.rst index 06d95e6b9c44d..a93ad32a4c8f8 100644 --- a/doc/README.rst +++ b/doc/README.rst @@ -160,7 +160,7 @@ and `Good as first PR `_ where you could start out. -Or maybe you have an idea of you own, by using pandas, looking for something +Or maybe you have an idea of your own, by using pandas, looking for something in the documentation and thinking 'this can be improved', let's do something about that! diff --git a/doc/source/10min.rst b/doc/source/10min.rst index d51290b2a983b..54bcd76855f32 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -483,6 +483,17 @@ SQL style merges. See the :ref:`Database style joining ` right pd.merge(left, right, on='key') +Another example that can be given is: + +.. ipython:: python + + left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]}) + right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]}) + left + right + pd.merge(left, right, on='key') + + Append ~~~~~~ diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 7c7895a95310d..e50e792201d26 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -528,6 +528,13 @@ return a copy of the data rather than a view: jim joe 1 z 0.64094 +Furthermore if you try to index something that is not fully lexsorted, this can raise: + +.. code-block:: ipython + + In [5]: dfm.loc[(0,'y'):(1, 'z')] + KeyError: 'Key length (2) was greater than MultiIndex lexsort depth (1)' + The ``is_lexsorted()`` method on an ``Index`` show if the index is sorted, and the ``lexsort_depth`` property returns the sort depth: .. ipython:: python @@ -542,6 +549,12 @@ The ``is_lexsorted()`` method on an ``Index`` show if the index is sorted, and t dfm.index.is_lexsorted() dfm.index.lexsort_depth +And now selection works as expected. + +.. 
ipython:: python + + dfm.loc[(0,'y'):(1, 'z')] + Take Methods ------------ diff --git a/doc/source/api.rst b/doc/source/api.rst index 9557867c252ed..0e893308dd935 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -354,6 +354,9 @@ Computations / Descriptive Stats Series.unique Series.nunique Series.is_unique + Series.is_monotonic + Series.is_monotonic_increasing + Series.is_monotonic_decreasing Series.value_counts Reindexing / Selection / Label manipulation @@ -1333,6 +1336,7 @@ Modifying and Computations Index.max Index.reindex Index.repeat + Index.where Index.take Index.putmask Index.set_names diff --git a/doc/source/basics.rst b/doc/source/basics.rst index e3b0915cd571d..917d2f2bb8b04 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1726,6 +1726,28 @@ then the more *general* one will be used as the result of the operation. # conversion of dtypes df3.astype('float32').dtypes +Convert a subset of columns to a specified type using :meth:`~DataFrame.astype` + +.. ipython:: python + + dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]}) + dft[['a','b']] = dft[['a','b']].astype(np.uint8) + dft + dft.dtypes + +.. note:: + + When trying to convert a subset of columns to a specified type using :meth:`~DataFrame.astype` and :meth:`~DataFrame.loc`, upcasting occurs. + + :meth:`~DataFrame.loc` tries to fit in what we are assigning to the current dtypes, while ``[]`` will overwrite them taking the dtype from the right hand side. Therefore the following piece of code produces the unintended result. + + .. ipython:: python + + dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]}) + dft.loc[:, ['a', 'b']].astype(np.uint8).dtypes + dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8) + dft.dtypes + object conversion ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index e64ff4c155132..a9b86925666b7 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -21,7 +21,7 @@ and `Difficulty Novice `_ where you could start out. -Or maybe through using *pandas* you have an idea of you own or are looking for something +Or maybe through using *pandas* you have an idea of your own or are looking for something in the documentation and thinking 'this can be improved'...you can do something about it! diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index a4db4b7c0d953..685a8690a53d5 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -95,7 +95,7 @@ Plain cython ~~~~~~~~~~~~ First we're going to need to import the cython magic function to ipython (for -cython versions >=0.21 you can use ``%load_ext Cython``): +cython versions < 0.21 you can use ``%load_ext cythonmagic``): .. ipython:: python :okwarning: diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 4cde1fed344a8..02309fe5d6509 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -52,7 +52,7 @@ following: step and try to return a sensibly combined result if it doesn't fit into either of the above two categories -Since the set of object instance method on pandas data structures are generally +Since the set of object instance methods on pandas data structures are generally rich and expressive, we often simply want to invoke, say, a DataFrame function on each group. 
The name GroupBy should be quite familiar to those who have used a SQL-based tool (or ``itertools``), in which you can write code like: @@ -129,7 +129,7 @@ columns: In [5]: grouped = df.groupby(get_letter_type, axis=1) -Starting with 0.8, pandas Index objects now supports duplicate values. If a +Starting with 0.8, pandas Index objects now support duplicate values. If a non-unique index is used as the group key in a groupby operation, all values for the same index value will be considered to be in one group and thus the output of aggregation functions will only contain unique index values: @@ -171,7 +171,8 @@ By default the group keys are sorted during the ``groupby`` operation. You may h df2.groupby(['X'], sort=False).sum() -Note that ``groupby`` will preserve the order in which *observations* are sorted *within* each group. For example, the groups created by ``groupby()`` below are in the order the appeared in the original ``DataFrame``: +Note that ``groupby`` will preserve the order in which *observations* are sorted *within* each group. +For example, the groups created by ``groupby()`` below are in the order they appeared in the original ``DataFrame``: .. ipython:: python @@ -254,7 +255,7 @@ GroupBy with MultiIndex With :ref:`hierarchically-indexed data `, it's quite natural to group by one of the levels of the hierarchy. -Let's create a series with a two-level ``MultiIndex``. +Let's create a Series with a two-level ``MultiIndex``. .. ipython:: python @@ -636,7 +637,7 @@ with NaNs. dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) -For dataframes with multiple columns, filters should explicitly specify a column as the filter criterion. +For DataFrames with multiple columns, filters should explicitly specify a column as the filter criterion. .. ipython:: python @@ -755,7 +756,7 @@ The dimension of the returned result can also change: .. note:: - ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to apply. + ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to it. So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in the output as well as set the indices. @@ -789,7 +790,7 @@ Again consider the example DataFrame we've been looking at: df -Supposed we wished to compute the standard deviation grouped by the ``A`` +Suppose we wish to compute the standard deviation grouped by the ``A`` column. There is a slight problem, namely that we don't care about the data in column ``B``. We refer to this as a "nuisance" column. If the passed aggregation function can't be applied to some columns, the troublesome columns @@ -1019,7 +1020,7 @@ Returning a Series to propagate names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Group DataFrame columns, compute a set of metrics and return a named Series. -The Series name is used as the name for the column index. This is especially +The Series name is used as the name for the column index. This is especially useful in conjunction with reshaping operations such as stacking in which the column index name will be used as the name of the inserted column: diff --git a/doc/source/io.rst b/doc/source/io.rst index cc51fbd1e30ab..f559c3cb3ebaf 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -99,7 +99,7 @@ delimiter : str, default ``None`` Alternative argument name for sep. delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. 
``' '`` or ``'\t'``) - will be used as the delimiter. Equivalent to setting ``sep='\+s'``. + will be used as the delimiter. Equivalent to setting ``sep='\s+'``. If this option is set to True, nothing should be passed in for the ``delimiter`` parameter. @@ -120,7 +120,8 @@ header : int or list of ints, default ``'infer'`` rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should - explicitly pass ``header=None``. + explicitly pass ``header=None``. Duplicates in this list are not allowed unless + ``mangle_dupe_cols=True``, which is the default. index_col : int or sequence or ``False``, default ``None`` Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of @@ -139,6 +140,8 @@ prefix : str, default ``None`` Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... mangle_dupe_cols : boolean, default ``True`` Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'. + Passing in False will cause data to be overwritten if there are duplicate + names in the columns. General Parsing Configuration +++++++++++++++++++++++++++++ @@ -166,6 +169,30 @@ skipfooter : int, default ``0`` Number of lines at bottom of file to skip (unsupported with engine='c'). nrows : int, default ``None`` Number of rows of file to read. Useful for reading pieces of large files. +low_memory : boolean, default ``True`` + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set ``False``, or specify the type with the ``dtype`` parameter. + Note that the entire file is read into a single DataFrame regardless, + use the ``chunksize`` or ``iterator`` parameter to return the data in chunks. + (Only valid with C parser) +buffer_lines : int, default None + DEPRECATED: this argument will be removed in a future version because its + value is not respected by the parser + + If ``low_memory`` is ``True``, specify the number of rows to be read for + each chunk. (Only valid with C parser) +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the + parser will attempt to cast it as the smallest integer ``dtype`` possible, either + signed or unsigned depending on the specification from the ``use_unsigned`` parameter. +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether + the column should be compacted to the smallest signed or unsigned integer dtype. NA and Missing Data Handling ++++++++++++++++++++++++++++ @@ -252,6 +279,10 @@ quoting : int or ``csv.QUOTE_*`` instance, default ``None`` ``QUOTE_MINIMAL`` (0), ``QUOTE_ALL`` (1), ``QUOTE_NONNUMERIC`` (2) or ``QUOTE_NONE`` (3). Default (``None``) results in ``QUOTE_MINIMAL`` behavior. +doublequote : boolean, default ``True`` + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, + indicate whether or not to interpret two consecutive ``quotechar`` elements + **inside** a field as a single ``quotechar`` element. escapechar : str (length 1), default ``None`` One-character string used to escape delimiter when quoting is ``QUOTE_NONE``. 
comment : str, default ``None`` @@ -432,6 +463,42 @@ If the header is in a row other than the first, pass the row number to data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' pd.read_csv(StringIO(data), header=1) +.. _io.dupe_names: + +Duplicate names parsing +''''''''''''''''''''''' + +If the file or header contains duplicate names, pandas by default will deduplicate +these names so as to prevent data overwrite: + +.. ipython :: python + + data = 'a,b,a\n0,1,2\n3,4,5' + pd.read_csv(StringIO(data)) + +There is no more duplicate data because ``mangle_dupe_cols=True`` by default, which modifies +a series of duplicate columns 'X'...'X' to become 'X.0'...'X.N'. If ``mangle_dupe_cols +=False``, duplicate data can arise: + +.. code-block :: python + + In [2]: data = 'a,b,a\n0,1,2\n3,4,5' + In [3]: pd.read_csv(StringIO(data), mangle_dupe_cols=False) + Out[3]: + a b a + 0 2 1 2 + 1 5 4 5 + +To prevent users from encountering this problem with duplicate data, a ``ValueError`` +exception is raised if ``mangle_dupe_cols != True``: + +.. code-block :: python + + In [2]: data = 'a,b,a\n0,1,2\n3,4,5' + In [3]: pd.read_csv(StringIO(data), mangle_dupe_cols=False) + ... + ValueError: Setting mangle_dupe_cols=False is not supported yet + .. _io.usecols: Filtering columns (``usecols``) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 7908428135308..ba675d9aac830 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -562,10 +562,8 @@ DataFrame instance method, with the calling DataFrame being implicitly considered the left object in the join. The related ``DataFrame.join`` method, uses ``merge`` internally for the -index-on-index and index-on-column(s) joins, but *joins on indexes* by default -rather than trying to join on common columns (the default behavior for -``merge``). If you are joining on index, you may wish to use ``DataFrame.join`` -to save yourself some typing. +index-on-index (by default) and column(s)-on-index join. If you are joining on +index only, you may wish to use ``DataFrame.join`` to save yourself some typing. Brief primer on merge methods (relational algebra) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 21765b3f621ce..9ed2c42610b69 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -445,6 +445,16 @@ If ``crosstab`` receives only two Series, it will provide a frequency table. pd.crosstab(df.A, df.B) +Any input passed containing ``Categorical`` data will have **all** of its +categories included in the cross-tabulation, even if the actual data does +not contain any instances of a particular category. + +.. ipython:: python + + foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + pd.crosstab(foo, bar) + Normalization ~~~~~~~~~~~~~ diff --git a/doc/source/text.rst b/doc/source/text.rst index 16b16a320f75b..3822c713d7f85 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -281,7 +281,7 @@ Unlike ``extract`` (which returns only the first match), .. ipython:: python - s = pd.Series(["a1a2", "b1", "c1"], ["A", "B", "C"]) + s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) s two_groups = '(?P[a-z])(?P[0-9])' s.str.extract(two_groups, expand=True) @@ -313,6 +313,17 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as extractall_result extractall_result.xs(0, level="match") +``Index`` also supports ``.str.extractall``. 
It returns a ``DataFrame`` which has the +same result as a ``Series.str.extractall`` with a default index (starts from 0). + +.. versionadded:: 0.18.2 + +.. ipython:: python + + pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups) + + pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups) + Testing for Strings that Match or Contain a Pattern --------------------------------------------------- diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 114607f117756..62601821488d3 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -98,6 +98,7 @@ time. pd.Timestamp(datetime(2012, 5, 1)) pd.Timestamp('2012-05-01') + pd.Timestamp(2012, 5, 1) However, in many cases it is more natural to associate things like change variables with a time span instead. The span represented by ``Period`` can be diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 7f837bef5251c..51982c42499ff 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -563,7 +563,6 @@ Performance Improvements - Improved speed of SAS reader (:issue:`12656`, :issue:`12961`) - Performance improvements in ``.groupby(..).cumcount()`` (:issue:`11039`) - Improved memory usage in ``pd.read_csv()`` when using ``skiprows=an_integer`` (:issue:`13005`) - - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`) - Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`). - Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index fa426aa30bc65..6829afa2b36b8 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -19,10 +19,37 @@ Highlights include: New features ~~~~~~~~~~~~ +.. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support: +``pd.read_csv`` has improved support for duplicate column names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:ref:`Duplicate column names ` are now supported in ``pd.read_csv()`` whether +they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) +.. ipython :: python + data = '0,1,2\n3,4,5' + names = ['a', 'b', 'a'] + +Previous behaviour: + +.. code-block:: ipython + + In [2]: pd.read_csv(StringIO(data), names=names) + Out[2]: + a b a + 0 2 1 2 + 1 5 4 5 + +The first 'a' column contains the same data as the second 'a' column, when it should have +contained the array ``[0, 3]``. + +New behaviour: + +.. ipython :: python + + In [2]: pd.read_csv(StringIO(data), names=names) .. _whatsnew_0182.enhancements.other: @@ -30,9 +57,38 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) +- ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) +- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) + + .. 
ipython:: python + + idx = pd.Index(["a1a2", "b1", "c1"]) + idx.str.extractall("[ab](?P\d)") + +- ``Timestamp`` s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) + .. ipython:: python + pd.Timestamp(2012, 1, 1) + pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) + +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) + +- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) +- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) + .. ipython:: python + + idx = pd.Index(['a', 'b', 'c']) + idx.where([True, False, True]) +- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) +- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) +- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) +- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) + +- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) +- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) .. _whatsnew_0182.api: @@ -41,7 +97,8 @@ API changes - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) - +- An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) +- Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) .. _whatsnew_0182.api.tolist: @@ -70,25 +127,170 @@ New Behavior: type(s.tolist()[0]) +.. _whatsnew_0182.api.promote: + +``Series`` type promotion on assignment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``Series`` will now correctly promote its dtype for assignment with incompat values to the current dtype (:issue:`13234`) + + +.. ipython:: python + + s = pd.Series() + +Previous Behavior: + +.. code-block:: ipython + + In [2]: s["a"] = pd.Timestamp("2016-01-01") + + In [3]: s["b"] = 3.0 + TypeError: invalid type promotion + +New Behavior: + +.. ipython:: python + + s["a"] = pd.Timestamp("2016-01-01") + s["b"] = 3.0 + s + s.dtype + +.. _whatsnew_0182.api.to_datetime_coerce: + +``.to_datetime()`` when coercing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A bug is fixed in ``.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). +Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes with ``errors='coerce'`` it would convert all to ``NaT``. + +Previous Behavior: + +.. 
code-block:: ipython + + In [2]: pd.to_datetime([1, 'foo'], errors='coerce') + Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) + +This will now convert integers/floats with the default unit of ``ns``. + +.. ipython:: python + + pd.to_datetime([1, 'foo'], errors='coerce') + +.. _whatsnew_0182.api.merging: + +Merging changes +^^^^^^^^^^^^^^^ + +Merging will now preserve the dtype of the join keys (:issue:`8596`) + +.. ipython:: python + + df1 = pd.DataFrame({'key': [1], 'v1': [10]}) + df1 + df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + df2 + +Previous Behavior: + +.. code-block:: ipython + + In [5]: pd.merge(df1, df2, how='outer') + Out[5]: + key v1 + 0 1.0 10.0 + 1 1.0 20.0 + 2 2.0 30.0 + + In [6]: pd.merge(df1, df2, how='outer').dtypes + Out[6]: + key float64 + v1 float64 + dtype: object + +New Behavior: + +We are able to preserve the join keys + +.. ipython:: python + + pd.merge(df1, df2, how='outer') + pd.merge(df1, df2, how='outer').dtypes + +Of course if you have missing values that are introduced, then the +resulting dtype will be upcast (unchanged from previous). + +.. ipython:: python + + pd.merge(df1, df2, how='outer', on='key') + pd.merge(df1, df2, how='outer', on='key').dtypes + +.. _whatsnew_0182.describe: + +``.describe()`` changes +^^^^^^^^^^^^^^^^^^^^^^^ +Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`) +.. ipython:: python + s = pd.Series([0, 1, 2, 3, 4]) + df = pd.DataFrame([0, 1, 2, 3, 4]) +Previous Behavior: +The percentiles were rounded to at most one decimal place, which could raise ``ValueError`` for a data frame if the percentiles were duplicated. +.. code-block:: ipython + + In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[3]: + count 5.000000 + mean 2.000000 + std 1.581139 + min 0.000000 + 0.0% 0.000400 + 0.1% 0.002000 + 0.1% 0.004000 + 50% 2.000000 + 99.9% 3.996000 + 100.0% 3.998000 + 100.0% 3.999600 + max 4.000000 + dtype: float64 + + In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[4]: + ... + ValueError: cannot reindex from a duplicate axis + +New Behavior: + +.. ipython:: python + s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) +- Passing duplicated ``percentiles`` will now raise a ``ValueError``. +- Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) .. _whatsnew_0182.api.other: Other API changes ^^^^^^^^^^^^^^^^^ +- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) +- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) +- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) + .. _whatsnew_0182.deprecations: Deprecations ^^^^^^^^^^^^ +- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) +- ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) .. 
_whatsnew_0182.performance: @@ -97,38 +299,79 @@ Performance Improvements - Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) +- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) + +- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) .. _whatsnew_0182.bug_fixes: Bug Fixes ~~~~~~~~~ + +- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) +- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) +- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) +- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) +- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) +- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) +- Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`) +- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) +- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) +- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) +- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) +- Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) +- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) +- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame``appropriately when empty (:issue:`13212`) +- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) +- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue: `13306`) +- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. 
(:issue:`13231`) -- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) +- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) +- Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) +- Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) +- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) +- Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. bool); will now respect the ``errors`` keyword (:issue:`13176`) +- Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) +- Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) +- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) +- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) +- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) +- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) +- Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) +- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) -- Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) -- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) +- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) + + +- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt new file mode 100644 index 0000000000000..42db0388ca5d9 --- /dev/null +++ b/doc/source/whatsnew/v0.19.0.txt @@ -0,0 +1,83 @@ +.. _whatsnew_0190: + +v0.19.0 (????, 2016) +-------------------- + +This is a major release from 0.18.2 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +Highlights include: + + +Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. + +.. contents:: What's new in v0.19.0 + :local: + :backlinks: none + +.. _whatsnew_0190.enhancements: + +New features +~~~~~~~~~~~~ + + + + + +.. _whatsnew_0190.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + + + + + + +.. _whatsnew_0190.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0190.api: + + + + + + +Other API Changes +^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0190.deprecations: + +Deprecations +^^^^^^^^^^^^ + + + + + +.. 
_whatsnew_0190.prior_deprecations: + +Removal of prior version deprecations/changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + + + + +.. _whatsnew_0190.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + + + + +.. _whatsnew_0190.bug_fixes: + +Bug Fixes +~~~~~~~~~ diff --git a/pandas/algos.pyx b/pandas/algos.pyx index a31b35ba4afc6..7884d9c41845c 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1505,52 +1505,8 @@ def roll_kurt(ndarray[double_t] input, #------------------------------------------------------------------------------- # Rolling median, min, max -ctypedef double_t (* skiplist_f)(object sl, int n, int p) - -cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op): - cdef ndarray[double_t] input = arg - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef Py_ssize_t nobs = 0, i - - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - skiplist = IndexableSkiplist(win) - - minp = _check_minp(win, minp, N) - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - - if prev == prev: - skiplist.remove(prev) - nobs -= 1 - - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = op(skiplist, nobs, minp) - - return output - from skiplist cimport * - @cython.boundscheck(False) @cython.wraparound(False) def roll_median_c(ndarray[float64_t] arg, int win, int minp): diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 069cb3638fe75..274761f5d0b9c 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -21,7 +21,7 @@ from numpy import ndarray from pandas.util.validators import (validate_args, validate_kwargs, validate_args_and_kwargs) -from pandas.core.common import is_integer +from pandas.core.common import is_integer, UnsupportedFunctionCall from pandas.compat import OrderedDict @@ -245,3 +245,77 @@ def validate_transpose_for_generic(inst, kwargs): msg += " for {klass} instances".format(klass=klass) raise ValueError(msg) + + +def validate_window_func(name, args, kwargs): + numpy_args = ('axis', 'dtype', 'out') + msg = ("numpy operations are not " + "valid with window objects. " + "Use .{func}() directly instead ".format(func=name)) + + if len(args) > 0: + raise UnsupportedFunctionCall(msg) + + for arg in numpy_args: + if arg in kwargs: + raise UnsupportedFunctionCall(msg) + + +def validate_rolling_func(name, args, kwargs): + numpy_args = ('axis', 'dtype', 'out') + msg = ("numpy operations are not " + "valid with window objects. " + "Use .rolling(...).{func}() instead ".format(func=name)) + + if len(args) > 0: + raise UnsupportedFunctionCall(msg) + + for arg in numpy_args: + if arg in kwargs: + raise UnsupportedFunctionCall(msg) + + +def validate_expanding_func(name, args, kwargs): + numpy_args = ('axis', 'dtype', 'out') + msg = ("numpy operations are not " + "valid with window objects. 
" + "Use .expanding(...).{func}() instead ".format(func=name)) + + if len(args) > 0: + raise UnsupportedFunctionCall(msg) + + for arg in numpy_args: + if arg in kwargs: + raise UnsupportedFunctionCall(msg) + + +def validate_groupby_func(name, args, kwargs): + """ + 'args' and 'kwargs' should be empty because all of + their necessary parameters are explicitly listed in + the function signature + """ + if len(args) + len(kwargs) > 0: + raise UnsupportedFunctionCall(( + "numpy operations are not valid " + "with groupby. Use .groupby(...)." + "{func}() instead".format(func=name))) + +RESAMPLER_NUMPY_OPS = ('min', 'max', 'sum', 'prod', + 'mean', 'std', 'var') + + +def validate_resampler_func(method, args, kwargs): + """ + 'args' and 'kwargs' should be empty because all of + their necessary parameters are explicitly listed in + the function signature + """ + if len(args) + len(kwargs) > 0: + if method in RESAMPLER_NUMPY_OPS: + raise UnsupportedFunctionCall(( + "numpy operations are not valid " + "with resample. Use .resample(...)." + "{func}() instead".format(func=method))) + else: + raise TypeError("too many arguments passed in") diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 01d0fa664ac41..f1cf210754d12 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -5,6 +5,7 @@ import tokenize from functools import partial +import numpy as np import pandas as pd from pandas import compat @@ -356,6 +357,19 @@ def _possibly_transform_eq_ne(self, node, left=None, right=None): right) return op, op_class, left, right + def _possibly_downcast_constants(self, left, right): + f32 = np.dtype(np.float32) + if left.isscalar and not right.isscalar and right.return_type == f32: + # right is a float32 array, left is a scalar + name = self.env.add_tmp(np.float32(left.value)) + left = self.term_type(name, self.env) + if right.isscalar and not left.isscalar and left.return_type == f32: + # left is a float32 array, right is a scalar + name = self.env.add_tmp(np.float32(right.value)) + right = self.term_type(name, self.env) + + return left, right + def _possibly_eval(self, binop, eval_in_python): # eval `in` and `not in` (for now) in "partial" python space # things that can be evaluated in "eval" space will be turned into @@ -399,6 +413,7 @@ def _possibly_evaluate_binop(self, op, op_class, lhs, rhs, def visit_BinOp(self, node, **kwargs): op, op_class, left, right = self._possibly_transform_eq_ne(node) + left, right = self._possibly_downcast_constants(left, right) return self._possibly_evaluate_binop(op, op_class, left, right) def visit_Div(self, node, **kwargs): diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 603c030dcaa6e..bf6fa35cf255f 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -276,18 +276,26 @@ def _not_in(x, y): _binary_ops_dict.update(d) -def _cast_inplace(terms, dtype): +def _cast_inplace(terms, acceptable_dtypes, dtype): """Cast an expression inplace. Parameters ---------- terms : Op The expression that should cast. + acceptable_dtypes : list of acceptable numpy.dtype + Will not cast if term's dtype in this list. + + .. versionadded:: 0.18.2 + dtype : str or numpy.dtype The dtype to cast to. 
""" dt = np.dtype(dtype) for term in terms: + if term.type in acceptable_dtypes: + continue + try: new_value = term.value.astype(dt) except AttributeError: @@ -452,7 +460,9 @@ def __init__(self, lhs, rhs, truediv, *args, **kwargs): rhs.return_type)) if truediv or PY3: - _cast_inplace(com.flatten(self), np.float_) + # do not upcast float32s to float64 un-necessarily + acceptable_dtypes = [np.float32, np.float_] + _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) _unary_ops_syms = '+', '-', '~', 'not' diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 143e6017b462a..5019dd392a567 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -12,8 +12,6 @@ from numpy.random import randn, rand, randint import numpy as np -from numpy.testing import assert_allclose -from numpy.testing.decorators import slow import pandas as pd from pandas.core import common as com @@ -33,7 +31,8 @@ import pandas.lib as lib from pandas.util.testing import (assert_frame_equal, randbool, assertRaisesRegexp, assert_numpy_array_equal, - assert_produces_warning, assert_series_equal) + assert_produces_warning, assert_series_equal, + slow) from pandas.compat import PY3, u, reduce _series_frame_incompatible = _bool_ops_syms @@ -186,6 +185,16 @@ def test_chained_cmp_op(self): mids, cmp_ops, self.rhses): self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) + def check_equal(self, result, expected): + if isinstance(result, DataFrame): + tm.assert_frame_equal(result, expected) + elif isinstance(result, Series): + tm.assert_series_equal(result, expected) + elif isinstance(result, np.ndarray): + tm.assert_numpy_array_equal(result, expected) + else: + self.assertEqual(result, expected) + def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): skip_these = _scalar_skip ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, @@ -219,7 +228,7 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): expected = _eval_single_bin( lhs_new, binop, rhs_new, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(result, expected) + self.check_equal(result, expected) def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): skip_these = _scalar_skip @@ -239,7 +248,8 @@ def check_operands(left, right, cmp_op): for ex in (ex1, ex2, ex3): result = pd.eval(ex, engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(result, expected) + + tm.assert_almost_equal(result, expected) def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) @@ -250,13 +260,14 @@ def check_simple_cmp_op(self, lhs, cmp1, rhs): else: expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(result, expected) + self.check_equal(result, expected) def check_binary_arith_op(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = _eval_single_bin(lhs, arith1, rhs, self.engine) - tm.assert_numpy_array_equal(result, expected) + + tm.assert_almost_equal(result, expected) ex = 'lhs {0} rhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) nlhs = _eval_single_bin(lhs, arith1, rhs, @@ -271,8 +282,10 @@ def check_alignment(self, result, nlhs, ghs, op): # TypeError, AttributeError: series or frame with scalar align pass else: + + # direct numpy comparison expected = 
self.ne.evaluate('nlhs {0} ghs'.format(op)) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result.values, expected) # modulus, pow, and floor division require special casing @@ -280,9 +293,13 @@ def check_modulus(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs - assert_allclose(result, expected) + + tm.assert_almost_equal(result, expected) expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) - assert_allclose(result, expected) + if isinstance(result, (DataFrame, Series)): + tm.assert_almost_equal(result.values, expected) + else: + tm.assert_almost_equal(result, expected.item()) def check_floor_division(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) @@ -290,7 +307,7 @@ def check_floor_division(self, lhs, arith1, rhs): if self.engine == 'python': res = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs // rhs - tm.assert_numpy_array_equal(res, expected) + self.check_equal(res, expected) else: self.assertRaises(TypeError, pd.eval, ex, local_dict={'lhs': lhs, 'rhs': rhs}, @@ -319,13 +336,13 @@ def check_pow(self, lhs, arith1, rhs): self.assertRaises(AssertionError, tm.assert_numpy_array_equal, result, expected) else: - assert_allclose(result, expected) + tm.assert_almost_equal(result, expected) ex = '(lhs {0} rhs) {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = self.get_expected_pow_result( self.get_expected_pow_result(lhs, rhs), rhs) - assert_allclose(result, expected) + tm.assert_almost_equal(result, expected) def check_single_invert_op(self, lhs, cmp1, rhs): # simple @@ -336,12 +353,12 @@ def check_single_invert_op(self, lhs, cmp1, rhs): elb = np.array([bool(el)]) expected = ~elb result = pd.eval('~elb', engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(expected, result) + tm.assert_almost_equal(expected, result) for engine in self.current_engines: tm.skip_if_no_ne(engine) - tm.assert_numpy_array_equal(result, pd.eval('~elb', engine=engine, - parser=self.parser)) + tm.assert_almost_equal(result, pd.eval('~elb', engine=engine, + parser=self.parser)) def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = 'in', 'not in' @@ -361,13 +378,13 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): else: expected = ~expected result = pd.eval(ex, engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(expected, result) + tm.assert_almost_equal(expected, result) # make sure the other engines work the same as this one for engine in self.current_engines: tm.skip_if_no_ne(engine) ev = pd.eval(ex, engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(ev, result) + tm.assert_almost_equal(ev, result) def ex(self, op, var_name='lhs'): return '{0}{1}'.format(op, var_name) @@ -701,10 +718,10 @@ def check_modulus(self, lhs, arith1, rhs): result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs - assert_allclose(result, expected) + tm.assert_almost_equal(result, expected) expected = _eval_single_bin(expected, arith1, rhs, self.engine) - assert_allclose(result, expected) + tm.assert_almost_equal(result, expected) def check_alignment(self, result, nlhs, ghs, op): try: @@ -715,7 +732,7 @@ def check_alignment(self, result, nlhs, ghs, op): pass else: expected = eval('nlhs {0} ghs'.format(op)) - tm.assert_numpy_array_equal(result, expected) + tm.assert_almost_equal(result, expected) class 
TestEvalPythonPandas(TestEvalPythonPython): @@ -736,6 +753,35 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): ENGINES_PARSERS = list(product(_engines, expr._parsers)) +#------------------------------------- +# typecasting rules consistency with python +# issue #12388 + +class TestTypeCasting(tm.TestCase): + + def check_binop_typecasting(self, engine, parser, op, dt): + tm.skip_if_no_ne(engine) + df = mkdf(5, 3, data_gen_f=f, dtype=dt) + s = 'df {} 3'.format(op) + res = pd.eval(s, engine=engine, parser=parser) + self.assertTrue(df.values.dtype == dt) + self.assertTrue(res.values.dtype == dt) + assert_frame_equal(res, eval(s)) + + s = '3 {} df'.format(op) + res = pd.eval(s, engine=engine, parser=parser) + self.assertTrue(df.values.dtype == dt) + self.assertTrue(res.values.dtype == dt) + assert_frame_equal(res, eval(s)) + + def test_binop_typecasting(self): + for engine, parser in ENGINES_PARSERS: + for op in ['+', '-', '*', '**', '/']: + # maybe someday... numexpr has too many upcasting rules now + #for dt in chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])): + for dt in [np.float32, np.float64]: + yield self.check_binop_typecasting, engine, parser, op, dt + #------------------------------------- # basic and complex alignment @@ -1578,7 +1624,7 @@ def test_binary_functions(self): expr = "{0}(a, b)".format(fn) got = self.eval(expr) expect = getattr(np, fn)(a, b) - np.testing.assert_allclose(got, expect) + tm.assert_almost_equal(got, expect, check_names=False) def test_df_use_case(self): df = DataFrame({'a': np.random.randn(10), diff --git a/pandas/core/base.py b/pandas/core/base.py index 1a812ba2e4878..96732a7140f9e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -127,7 +127,7 @@ def __sizeof__(self): # no memory_usage attribute, so fall back to # object's 'sizeof' - return super(self, PandasObject).__sizeof__() + return super(PandasObject, self).__sizeof__() class NoNewAttributesMixin(object): @@ -995,6 +995,37 @@ def is_unique(self): """ return self.nunique() == len(self) + @property + def is_monotonic(self): + """ + Return boolean if values in the object are + monotonic_increasing + + .. versionadded:: 0.18.2 + + Returns + ------- + is_monotonic : boolean + """ + from pandas import Index + return Index(self).is_monotonic + is_monotonic_increasing = is_monotonic + + @property + def is_monotonic_decreasing(self): + """ + Return boolean if values in the object are + monotonic_decreasing + + .. versionadded:: 0.18.2 + + Returns + ------- + is_monotonic_decreasing : boolean + """ + from pandas import Index + return Index(self).is_monotonic_decreasing + def memory_usage(self, deep=False): """ Memory usage of my values diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 4f80c610c1126..fa3d13c174245 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -336,11 +336,26 @@ def copy(self): categories=self.categories, ordered=self.ordered, fastpath=True) - def astype(self, dtype): - """ coerce this type to another dtype """ + def astype(self, dtype, copy=True): + """ + Coerce this type to another dtype + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and dtype is categorical, the original + object is returned. + + .. 
versionadded:: 0.18.2 + + """ if is_categorical_dtype(dtype): + if copy is True: + return self.copy() return self - return np.array(self, dtype=dtype) + return np.array(self, dtype=dtype, copy=copy) @cache_readonly def ndim(self): @@ -883,8 +898,8 @@ def remove_unused_categories(self, inplace=False): if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv = idx[1:], inv - 1 - cat._codes = inv cat._categories = cat.categories.take(idx) + cat._codes = _coerce_indexer_dtype(inv, self._categories) if not inplace: return cat @@ -985,7 +1000,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. if '_codes' not in state and 'labels' in state: - state['_codes'] = state.pop('labels') + state['_codes'] = state.pop('labels').astype(np.int8) if '_categories' not in state and '_levels' in state: state['_categories'] = self._validate_categories(state.pop( '_levels')) diff --git a/pandas/core/common.py b/pandas/core/common.py index c64cfa77b9e62..d26c59e62de30 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -41,6 +41,10 @@ class AmbiguousIndexError(PandasError, KeyError): pass +class UnsupportedFunctionCall(ValueError): + pass + + class AbstractMethodError(NotImplementedError): """Raise this error instead of NotImplementedError for abstract methods while keeping compatibility with Python 2 and Python 3. @@ -138,7 +142,7 @@ def _isnull_old(obj): def _use_inf_as_null(key): """Option change callback for null/inf behaviour - Choose which replacement for numpy.isnan / -numpy.isfinite is used. + Choose which replacement for numpy.isnan / ~numpy.isfinite is used. Parameters ---------- @@ -229,7 +233,7 @@ def _isnull_ndarraylike_old(obj): def notnull(obj): - """Replacement for numpy.isfinite / -numpy.isnan which is suitable for use + """Replacement for numpy.isfinite / ~numpy.isnan which is suitable for use on object arrays. Parameters @@ -312,8 +316,8 @@ def array_equivalent(left, right, strict_nan=False): if not strict_nan: # pd.isnull considers NaN and None to be equivalent. - return lib.array_equivalent_object( - _ensure_object(left.ravel()), _ensure_object(right.ravel())) + return lib.array_equivalent_object(_ensure_object(left.ravel()), + _ensure_object(right.ravel())) for left_value, right_value in zip(left, right): if left_value is tslib.NaT and right_value is not tslib.NaT: @@ -1111,7 +1115,7 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): def _possibly_infer_to_datetimelike(value, convert_dates=False): """ - we might have a array (or single object) that is datetime like, + we might have an array (or single object) that is datetime like, and no dtype is passed don't change the value unless we find a datetime/timedelta set @@ -1596,7 +1600,7 @@ def is_timedelta64_dtype(arr_or_dtype): def is_timedelta64_ns_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) + tipo = _get_dtype(arr_or_dtype) return tipo == _TD_DTYPE @@ -2058,7 +2062,7 @@ def _random_state(state=None): state : int, np.random.RandomState, None. If receives an int, passes to np.random.RandomState() as seed. If receives an np.random.RandomState object, just returns object. - If receives `None`, returns an np.random.RandomState object. + If receives `None`, returns np.random. If receives anything else, raises an informative ValueError. Default None. 
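The next hunk changes `_random_state` so that `state=None` now hands back the global `np.random` module rather than a fresh `RandomState` instance. A minimal sketch of the resulting contract, assuming only the private helper `pandas.core.common._random_state` as patched below:

import numpy as np
from pandas.core.common import _random_state  # private helper patched below

assert _random_state(None) is np.random                      # global module, shared state
assert isinstance(_random_state(42), np.random.RandomState)  # int seeds a fresh generator
rs = np.random.RandomState(7)
assert _random_state(rs) is rs                               # RandomState passes through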
@@ -2072,7 +2076,7 @@ def _random_state(state=None): elif isinstance(state, np.random.RandomState): return state elif state is None: - return np.random.RandomState() + return np.random else: raise ValueError("random_state must be an integer, a numpy " "RandomState, or None") diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b209b6d6ec543..69def7502a6f7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1062,7 +1062,7 @@ def to_records(self, index=True, convert_datetime64=True): count += 1 elif index_names[0] is None: index_names = ['index'] - names = index_names + lmap(str, self.columns) + names = lmap(str, index_names) + lmap(str, self.columns) else: arrays = [self[c].get_values() for c in self.columns] names = lmap(str, self.columns) @@ -4351,18 +4351,20 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', Series is passed, its name attribute must be set, and that will be used as the column name in the resulting joined DataFrame on : column name, tuple/list of column names, or array-like - Column(s) to use for joining, otherwise join on index. If multiples + Column(s) in the caller to join on the index in other, + otherwise joins index-on-index. If multiples columns given, the passed DataFrame must have a MultiIndex. Can pass an array as the join key if not already contained in the calling DataFrame. Like an Excel VLOOKUP operation - how : {'left', 'right', 'outer', 'inner'} - How to handle indexes of the two objects. Default: 'left' - for joining on index, None otherwise - - * left: use calling frame's index - * right: use input frame's index - * outer: form union of indexes - * inner: use intersection of indexes + how : {'left', 'right', 'outer', 'inner'}, default: 'left' + How to handle the operation of the two objects. + + * left: use calling frame's index (or column if on is specified) + * right: use other frame's index + * outer: form union of calling frame's index (or column if on is + specified) with other frame's index + * inner: form intersection of calling frame's index (or column if + on is specified) with other frame's index lsuffix : string Suffix to use from left frame's overlapping columns rsuffix : string @@ -4376,6 +4378,77 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', on, lsuffix, and rsuffix options are not supported when passing a list of DataFrame objects + Examples + -------- + >>> caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> caller + A key + 0 A0 K0 + 1 A1 K1 + 2 A2 K2 + 3 A3 K3 + 4 A4 K4 + 5 A5 K5 + + >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], + ... 'B': ['B0', 'B1', 'B2']}) + + >>> other + B key + 0 B0 K0 + 1 B1 K1 + 2 B2 K2 + + Join DataFrames using their indexes. + + >>> caller.join(other, lsuffix='_caller', rsuffix='_other') + + >>> A key_caller B key_other + 0 A0 K0 B0 K0 + 1 A1 K1 B1 K1 + 2 A2 K2 B2 K2 + 3 A3 K3 NaN NaN + 4 A4 K4 NaN NaN + 5 A5 K5 NaN NaN + + + If we want to join using the key columns, we need to set key to be + the index in both caller and other. The joined DataFrame will have + key as its index. + + >>> caller.set_index('key').join(other.set_index('key')) + + >>> A B + key + K0 A0 B0 + K1 A1 B1 + K2 A2 B2 + K3 A3 NaN + K4 A4 NaN + K5 A5 NaN + + Another option to join using the key columns is to use the on + parameter. DataFrame.join always uses other's index but we can use any + column in the caller. This method preserves the original caller's + index in the result. 
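An editorial aside on the paragraph above (not part of the docstring itself): `join(on=...)` is implemented as a left merge against the other frame's index, so its result can be cross-checked with an explicit `merge`. A small self-contained sketch:

import pandas as pd
from pandas.util.testing import assert_frame_equal

caller = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'A': ['A0', 'A1', 'A2']})
other = pd.DataFrame({'B': ['B0', 'B1']}, index=['K0', 'K1'])

via_join = caller.join(other, on='key')
via_merge = caller.merge(other, left_on='key', right_index=True, how='left')
assert_frame_equal(via_join, via_merge)  # same rows, caller's index preserved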
+ + >>> caller.join(other.set_index('key'), on='key') + + >>> A key B + 0 A0 K0 B0 + 1 A1 K1 B1 + 2 A2 K2 B2 + 3 A3 K3 NaN + 4 A4 K4 NaN + 5 A5 K5 NaN + + + See also + -------- + DataFrame.merge : For column(s)-on-columns(s) operations + Returns ------- joined : DataFrame @@ -4989,31 +5062,27 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, 0.5 2.5 55.0 """ self._check_percentile(q) - if not com.is_list_like(q): - q = [q] - squeeze = True - else: - squeeze = False data = self._get_numeric_data() if numeric_only else self axis = self._get_axis_number(axis) + is_transposed = axis == 1 - def _quantile(series): - res = series.quantile(q, interpolation=interpolation) - return series.name, res - - if axis == 1: + if is_transposed: data = data.T - # unable to use DataFrame.apply, becasuse data may be empty - result = dict(_quantile(s) for (_, s) in data.iteritems()) - result = self._constructor(result, columns=data.columns) - if squeeze: - if result.shape == (1, 1): - result = result.T.iloc[:, 0] # don't want scalar - else: - result = result.T.squeeze() - result.name = None # For groupby, so it can set an index name + result = data._data.quantile(qs=q, + axis=1, + interpolation=interpolation, + transposed=is_transposed) + + if result.ndim == 2: + result = self._constructor(result) + else: + result = self._constructor_sliced(result, name=q) + + if is_transposed: + result = result.T + return result def to_timestamp(self, freq=None, how='start', axis=0, copy=True): @@ -5468,7 +5537,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: gen = (list(x.keys()) for x in data) - columns = lib.fast_unique_multiple_list_gen(gen) + sort = not any(isinstance(d, OrderedDict) for d in data) + columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) # assure that they are of the base dict class and not of derived # classes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c80ab9d87e33..6f062a28b8dc7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,4 +1,5 @@ # pylint: disable=W0231,E1101 +import collections import warnings import operator import weakref @@ -20,6 +21,7 @@ import pandas.core.missing as missing import pandas.core.datetools as datetools from pandas.formats.printing import pprint_thing +from pandas.formats.format import format_percentiles from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lrange, string_types, @@ -143,7 +145,7 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): @property def _constructor(self): - """Used when a manipulation result has the same dimesions as the + """Used when a manipulation result has the same dimensions as the original. """ raise AbstractMethodError(self) @@ -2356,7 +2358,11 @@ def _reindex_axis(self, new_index, fill_method, axis, copy): def filter(self, items=None, like=None, regex=None, axis=None): """ - Restrict the info axis to set of items or wildcard + Subset rows or columns of dataframe according to labels in + the specified index. + + Note that this routine does not filter a dataframe on its + contents. The filter is applied to the labels of the index. 
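A sketch of the behaviour enforced by the new keyword guard added further down in this hunk (the `nkw` check): supplying more than one of `items`, `like`, and `regex` now raises instead of silently preferring one selector. Using a toy frame like the one in the docstring examples below:

import pandas as pd

df = pd.DataFrame({'one': [1, 4], 'two': [2, 5], 'three': [3, 6]},
                  index=['mouse', 'rabbit'])

df.filter(items=['one', 'three'])        # a single selector is fine
try:
    df.filter(items=['one'], like='tw')  # two selectors now raise
except TypeError as exc:
    print(exc)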
Parameters ---------- @@ -2366,19 +2372,57 @@ def filter(self, items=None, like=None, regex=None, axis=None): Keep info axis where "arg in col == True" regex : string (regular expression) Keep info axis with re.search(regex, col) == True - axis : int or None - The axis to filter on. By default this is the info axis. The "info - axis" is the axis that is used when indexing with ``[]``. For - example, ``df = DataFrame({'a': [1, 2, 3, 4]]}); df['a']``. So, - the ``DataFrame`` columns are the info axis. + axis : int or string axis name + The axis to filter on. By default this is the info axis, + 'index' for Series, 'columns' for DataFrame + + Returns + ------- + same type as input object + + Examples + -------- + >>> df + one two three + mouse 1 2 3 + rabbit 4 5 6 + + >>> # select columns by name + >>> df.filter(items=['one', 'three']) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select columns by regular expression + >>> df.filter(regex='e$', axis=1) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select rows containing 'bbi' + >>> df.filter(like='bbi', axis=0) + one two three + rabbit 4 5 6 + + See Also + -------- + pandas.DataFrame.select Notes ----- - Arguments are mutually exclusive, but this is not checked for + The ``items``, ``like``, and ``regex`` parameters are + enforced to be mutually exclusive. + ``axis`` defaults to the info axis that is used when indexing + with ``[]``. """ import re + nkw = sum([x is not None for x in [items, like, regex]]) + if nkw > 1: + raise TypeError('Keyword arguments `items`, `like`, or `regex` ' + 'are mutually exclusive') + if axis is None: axis = self._info_axis_name axis_name = self._get_axis_name(axis) @@ -2937,7 +2981,11 @@ def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): Parameters ---------- - dtype : numpy.dtype or Python type + dtype : numpy.dtype, Python type, or dict + Use a numpy.dtype or Python type to cast entire pandas object to the + same type. Alternatively, use {col: dtype, ...}, where col is a + column label and dtype is a numpy.dtype or Python type to cast one + or more of the DataFrame's columns to column-specific types. raise_on_error : raise on invalid input kwargs : keyword arguments to pass on to the constructor @@ -2945,10 +2993,27 @@ def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): ------- casted : type of caller """ - - mgr = self._data.astype(dtype=dtype, copy=copy, - raise_on_error=raise_on_error, **kwargs) - return self._constructor(mgr).__finalize__(self) + if isinstance(dtype, collections.Mapping): + if self.ndim == 1: # i.e. 
Series + if len(dtype) > 1 or list(dtype.keys())[0] != self.name: + raise KeyError('Only the Series name can be used for ' + 'the key in Series dtype mappings.') + typ = list(dtype.values())[0] + return self.astype(typ, copy, raise_on_error, **kwargs) + + from pandas.tools.merge import concat + casted_cols = [self[col].astype(typ, copy=copy) + for col, typ in dtype.items()] + other_col_labels = self.columns.difference(dtype.keys()) + other_cols = [self[col].copy() if copy else self[col] + for col in other_col_labels] + new_df = concat(casted_cols + other_cols, axis=1) + return new_df.reindex(columns=self.columns, copy=False) + + # else, only a single dtype is given + new_data = self._data.astype(dtype=dtype, copy=copy, + raise_on_error=raise_on_error, **kwargs) + return self._constructor(new_data).__finalize__(self) def copy(self, deep=True): """ @@ -4868,32 +4933,33 @@ def abs(self): @Appender(_shared_docs['describe'] % _shared_doc_kwargs) def describe(self, percentiles=None, include=None, exclude=None): if self.ndim >= 3: - msg = "describe is not implemented on on Panel or PanelND objects." + msg = "describe is not implemented on Panel or PanelND objects." raise NotImplementedError(msg) + elif self.ndim == 2 and self.columns.size == 0: + raise ValueError("Cannot describe a DataFrame without columns") if percentiles is not None: # get them all to be in [0, 1] self._check_percentile(percentiles) + + # median should always be included + if 0.5 not in percentiles: + percentiles.append(0.5) percentiles = np.asarray(percentiles) else: percentiles = np.array([0.25, 0.5, 0.75]) - # median should always be included - if (percentiles != 0.5).all(): # median isn't included - lh = percentiles[percentiles < .5] - uh = percentiles[percentiles > .5] - percentiles = np.hstack([lh, 0.5, uh]) + # sort and check for duplicates + unique_pcts = np.unique(percentiles) + if len(unique_pcts) < len(percentiles): + raise ValueError("percentiles cannot contain duplicates") + percentiles = unique_pcts - def pretty_name(x): - x *= 100 - if x == int(x): - return '%.0f%%' % x - else: - return '%.1f%%' % x + formatted_percentiles = format_percentiles(percentiles) - def describe_numeric_1d(series, percentiles): + def describe_numeric_1d(series): stat_index = (['count', 'mean', 'std', 'min'] + - [pretty_name(x) for x in percentiles] + ['max']) + formatted_percentiles + ['max']) d = ([series.count(), series.mean(), series.std(), series.min()] + [series.quantile(x) for x in percentiles] + [series.max()]) return pd.Series(d, index=stat_index, name=series.name) @@ -4918,18 +4984,18 @@ def describe_categorical_1d(data): return pd.Series(result, index=names, name=data.name) - def describe_1d(data, percentiles): + def describe_1d(data): if com.is_bool_dtype(data): return describe_categorical_1d(data) elif com.is_numeric_dtype(data): - return describe_numeric_1d(data, percentiles) + return describe_numeric_1d(data) elif com.is_timedelta64_dtype(data): - return describe_numeric_1d(data, percentiles) + return describe_numeric_1d(data) else: return describe_categorical_1d(data) if self.ndim == 1: - return describe_1d(self, percentiles) + return describe_1d(self) elif (include is None) and (exclude is None): if len(self._get_numeric_data()._info_axis) > 0: # when some numerics are found, keep only numerics @@ -4944,7 +5010,7 @@ def describe_1d(data, percentiles): else: data = self.select_dtypes(include=include, exclude=exclude) - ldesc = [describe_1d(s, percentiles) for _, s in data.iteritems()] + ldesc = [describe_1d(s) for _, s in 
data.iteritems()] # set a convenient order for rows names = [] ldesc_indexes = sorted([x.index for x in ldesc], key=len) @@ -4954,8 +5020,7 @@ def describe_1d(data, percentiles): names.append(name) d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1) - d.columns = self.columns._shallow_copy(values=d.columns.values) - d.columns.names = data.columns.names + d.columns = data.columns.copy() return d def _check_percentile(self, q): @@ -5299,7 +5364,7 @@ def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f): @Appender(_num_doc) def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - nv.validate_stat_func(tuple(), kwargs) + nv.validate_stat_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: @@ -5319,7 +5384,7 @@ def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f): @Appender(_num_ddof_doc) def stat_func(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs): - nv.validate_stat_ddof_func(tuple(), kwargs) + nv.validate_stat_ddof_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: @@ -5340,7 +5405,7 @@ def _make_cum_function(cls, name, name1, name2, axis_descr, desc, accum_func, @Appender("Return cumulative {0} over requested axis.".format(name) + _cnum_doc) def cum_func(self, axis=None, dtype=None, out=None, skipna=True, **kwargs): - nv.validate_cum_func(tuple(), kwargs) + nv.validate_cum_func(tuple(), kwargs, fname=name) if axis is None: axis = self._stat_axis_number else: @@ -5374,7 +5439,7 @@ def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f): @Appender(_bool_doc) def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs): - nv.validate_logical_func(tuple(), kwargs) + nv.validate_logical_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7a4791189726e..bea62e98e4a2a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -11,6 +11,7 @@ callable, map ) from pandas import compat +from pandas.compat.numpy import function as nv from pandas.compat.numpy import _np_version_under1p8 from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) @@ -36,7 +37,7 @@ is_datetime_or_timedelta_dtype, is_bool, is_bool_dtype, AbstractMethodError, _maybe_fill) -from pandas.core.config import option_context +from pandas.core.config import option_context, is_callable import pandas.lib as lib from pandas.lib import Timestamp import pandas.tslib as tslib @@ -642,9 +643,20 @@ def apply(self, func, *args, **kwargs): func = self._is_builtin_func(func) - @wraps(func) - def f(g): - return func(g, *args, **kwargs) + # this is needed so we don't try and wrap strings. 
If we could + # resolve functions to their callable functions prior, this + # wouldn't be needed + if args or kwargs: + if is_callable(func): + + @wraps(func) + def f(g): + return func(g, *args, **kwargs) + else: + raise ValueError('func must be a callable if args or ' + 'kwargs are supplied') + else: + f = func # ignore SettingWithCopy here in case the user mutates with option_context('mode.chained_assignment', None): @@ -806,8 +818,9 @@ def reset_identity(values): # reset the identities of the components # of the values to prevent aliasing for v in values: - ax = v._get_axis(self.axis) - ax._reset_identity() + if v is not None: + ax = v._get_axis(self.axis) + ax._reset_identity() return values if not not_indexed_same: @@ -954,12 +967,13 @@ def count(self): @Substitution(name='groupby') @Appender(_doc_template) - def mean(self): + def mean(self, *args, **kwargs): """ Compute mean of groups, excluding missing values For multiple groupings, the result index will be a MultiIndex """ + nv.validate_groupby_func('mean', args, kwargs) try: return self._cython_agg_general('mean') except GroupByError: @@ -993,7 +1007,7 @@ def f(x): @Substitution(name='groupby') @Appender(_doc_template) - def std(self, ddof=1): + def std(self, ddof=1, *args, **kwargs): """ Compute standard deviation of groups, excluding missing values @@ -1005,12 +1019,13 @@ def std(self, ddof=1): degrees of freedom """ - # todo, implement at cython level? + # TODO: implement at Cython level? + nv.validate_groupby_func('std', args, kwargs) return np.sqrt(self.var(ddof=ddof)) @Substitution(name='groupby') @Appender(_doc_template) - def var(self, ddof=1): + def var(self, ddof=1, *args, **kwargs): """ Compute variance of groups, excluding missing values @@ -1021,7 +1036,7 @@ def var(self, ddof=1): ddof : integer, default 1 degrees of freedom """ - + nv.validate_groupby_func('var', args, kwargs) if ddof == 1: return self._cython_agg_general('var') else: @@ -1317,8 +1332,9 @@ def cumcount(self, ascending=True): @Substitution(name='groupby') @Appender(_doc_template) - def cumprod(self, axis=0): + def cumprod(self, axis=0, *args, **kwargs): """Cumulative product for each group""" + nv.validate_groupby_func('cumprod', args, kwargs) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis)) @@ -1326,8 +1342,9 @@ def cumprod(self, axis=0): @Substitution(name='groupby') @Appender(_doc_template) - def cumsum(self, axis=0): + def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" + nv.validate_groupby_func('cumsum', args, kwargs) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis)) @@ -2669,7 +2686,7 @@ def _wrap_transformed_output(self, output, names=None): def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 - return Series([], name=self.name) + return Series([], name=self.name, index=keys) def _get_index(): if self.grouper.nkeys > 1: @@ -2776,18 +2793,11 @@ def _transform_fast(self, func): func = getattr(self, func) ids, _, ngroup = self.grouper.group_info - mask = ids != -1 - - out = func().values[ids] - if not mask.all(): - out = np.where(mask, out, np.nan) - - obs = np.zeros(ngroup, dtype='bool') - obs[ids[mask]] = True - if not obs.all(): - out = self._try_cast(out, self._selected_obj) - - return Series(out, index=self.obj.index) + cast = (self.size().fillna(0) > 0).any() + out = algos.take_1d(func().values, ids) + if cast: + out = self._try_cast(out, self.obj) + return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, 
dropna=True, *args, **kwargs): # noqa """ @@ -3223,12 +3233,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): from pandas.core.index import _all_indexes_same if len(keys) == 0: - # XXX - return DataFrame({}) + return DataFrame(index=keys) key_names = self.grouper.names - if isinstance(values[0], DataFrame): + # GH12824. + def first_non_None_value(values): + try: + v = next(v for v in values if v is not None) + except StopIteration: + return None + return v + + v = first_non_None_value(values) + + if v is None: + # GH9684. If all values are None, then this will throw an error. + # We'd prefer it return an empty dataframe. + return DataFrame() + elif isinstance(v, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: @@ -3255,21 +3278,15 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): key_index = None # make Nones an empty object - if com._count_not_none(*values) != len(values): - try: - v = next(v for v in values if v is not None) - except StopIteration: - # If all values are None, then this will throw an error. - # We'd prefer it return an empty dataframe. - return DataFrame() - if v is None: - return DataFrame() - elif isinstance(v, NDFrame): - values = [ - x if x is not None else - v._constructor(**v._construct_axes_dict()) - for x in values - ] + v = first_non_None_value(values) + if v is None: + return DataFrame() + elif isinstance(v, NDFrame): + values = [ + x if x is not None else + v._constructor(**v._construct_axes_dict()) + for x in values + ] v = values[0] @@ -3465,19 +3482,28 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) - results = np.empty_like(obj.values, result.values.dtype) - for (name, group), (i, row) in zip(self, result.iterrows()): - indexer = self._get_index(name) - if len(indexer) > 0: - results[indexer] = np.tile(row.values, len( - indexer)).reshape(len(indexer), -1) + return self._transform_fast(result, obj) - counts = self.size().fillna(0).values - if any(counts == 0): - results = self._try_cast(results, obj[result.columns]) + def _transform_fast(self, result, obj): + """ + Fast transform path for aggregations + """ + # if there were groups with no observations (Categorical only?) 
+ # try casting data to original dtype + cast = (self.size().fillna(0) > 0).any() + + # for each col, reshape to the size of the original frame + # by take operation + ids, _, ngroup = self.grouper.group_info + output = [] + for i, _ in enumerate(result.columns): + res = algos.take_1d(result.iloc[:, i].values, ids) + if cast: + res = self._try_cast(res, obj.iloc[:, i]) + output.append(res) - return (DataFrame(results, columns=result.columns, index=obj.index) ._convert(datetime=True)) + return DataFrame._from_arrays(output, columns=result.columns, + index=obj.index) def _define_paths(self, func, *args, **kwargs): if isinstance(func, compat.string_types): @@ -3630,17 +3656,12 @@ def _gotitem(self, key, ndim, subset=None): def _wrap_generic_output(self, result, obj): result_index = self.grouper.levels[0] - if result: - if self.axis == 0: - result = DataFrame(result, index=obj.columns, - columns=result_index).T - else: - result = DataFrame(result, index=obj.index, - columns=result_index) + if self.axis == 0: + return DataFrame(result, index=obj.columns, + columns=result_index).T else: - result = DataFrame(result) - - return result + return DataFrame(result, index=obj.index, + columns=result_index) def _get_data_to_aggregate(self): obj = self._obj_with_exclusions diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index acb0675247a78..9485f50ed07f1 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -336,9 +336,12 @@ def _setitem_with_indexer(self, indexer, value): # this preserves dtype of the value new_values = Series([value])._values if len(self.obj._values): - new_values = np.concatenate([self.obj._values, - new_values]) - + try: + new_values = np.concatenate([self.obj._values, + new_values]) + except TypeError: + new_values = np.concatenate([self.obj.asobject, + new_values]) self.obj._data = self.obj._constructor( new_values, index=new_index, name=self.obj.name)._data self.obj._maybe_update_cacher(clear=True) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index abfc5c989056e..97df81ad6be48 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -40,7 +40,7 @@ from pandas.util.decorators import cache_readonly from pandas.tslib import Timedelta -from pandas import compat +from pandas import compat, _np_version_under1p9 from pandas.compat import range, map, zip, u from pandas.lib import BlockPlacement @@ -84,7 +84,7 @@ def __init__(self, values, placement, ndim=None, fastpath=False): self.mgr_locs = placement self.values = values - if len(self.mgr_locs) != len(self.values): + if ndim and len(self.mgr_locs) != len(self.values): raise ValueError('Wrong number of items passed %d, placement ' 'implies %d' % (len(self.values), len(self.mgr_locs))) @@ -180,6 +180,12 @@ def make_block(self, values, placement=None, ndim=None, **kwargs): return make_block(values, placement=placement, ndim=ndim, **kwargs) + def make_block_scalar(self, values, **kwargs): + """ + Create a ScalarBlock + """ + return ScalarBlock(values) + def make_block_same_class(self, values, placement=None, fastpath=True, **kwargs): """ Wrap given values in a block of same type as self. 
""" @@ -324,7 +330,8 @@ def apply(self, func, mgr=None, **kwargs): """ result = func(self.values, **kwargs) if not isinstance(result, Block): - result = self.make_block(values=_block_shape(result)) + result = self.make_block(values=_block_shape(result, + ndim=self.ndim)) return result @@ -1260,32 +1267,117 @@ def equals(self, other): return False return array_equivalent(self.values, other.values) - def quantile(self, qs, mgr=None, **kwargs): + def quantile(self, qs, interpolation='linear', axis=0, mgr=None): """ compute the quantiles of the Parameters ---------- - qs : a scalar or list of the quantiles to be computed + qs: a scalar or list of the quantiles to be computed + interpolation: type of interpolation, default 'linear' + axis: axis to compute, default 0 + + Returns + ------- + tuple of (axis, block) + """ + if _np_version_under1p9: + if interpolation != 'linear': + raise ValueError("Interpolation methods other than linear " + "are not supported in numpy < 1.9.") + + kw = {} + if not _np_version_under1p9: + kw.update({'interpolation': interpolation}) values = self.get_values() - values, mask, _, _ = self._try_coerce_args(values, values) + values, _, _, _ = self._try_coerce_args(values, values) + mask = isnull(self.values) if not lib.isscalar(mask) and mask.any(): - values = values[~mask] - if len(values) == 0: - if com.is_list_like(qs): - result = np.array([self.fill_value]) + # even though this could be a 2-d mask it appears + # as a 1-d result + mask = mask.reshape(values.shape) + result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1)) + values = _block_shape(values[~mask], ndim=self.ndim) + if self.ndim > 1: + values = values.reshape(result_shape) + + from pandas import Float64Index + is_empty = values.shape[axis] == 0 + if com.is_list_like(qs): + ax = Float64Index(qs) + + if is_empty: + if self.ndim == 1: + result = self._na_value + else: + # create the array of na_values + # 2d len(values) * len(qs) + result = np.repeat(np.array([self._na_value] * len(qs)), + len(values)).reshape(len(values), + len(qs)) else: - result = self._na_value - elif com.is_list_like(qs): - values = [_quantile(values, x * 100, **kwargs) for x in qs] - result = np.array(values) + + try: + result = _quantile(values, np.array(qs) * 100, + axis=axis, **kw) + except ValueError: + + # older numpies don't handle an array for q + result = [_quantile(values, q * 100, + axis=axis, **kw) for q in qs] + + result = np.array(result, copy=False) + if self.ndim > 1: + result = result.T + else: - result = _quantile(values, qs * 100, **kwargs) - return self._try_coerce_result(result) + if self.ndim == 1: + ax = Float64Index([qs]) + else: + ax = mgr.axes[0] + + if is_empty: + if self.ndim == 1: + result = self._na_value + else: + result = np.array([self._na_value] * len(self)) + else: + result = _quantile(values, qs * 100, axis=axis, **kw) + + ndim = getattr(result, 'ndim', None) or 0 + result = self._try_coerce_result(result) + if lib.isscalar(result): + return ax, self.make_block_scalar(result) + return ax, make_block(result, + placement=np.arange(len(result)), + ndim=ndim) + + +class ScalarBlock(Block): + """ + a scalar compat Block + """ + __slots__ = ['_mgr_locs', 'values', 'ndim'] + + def __init__(self, values): + self.ndim = 0 + self.mgr_locs = [0] + self.values = values + + @property + def dtype(self): + return type(self.values) + + @property + def shape(self): + return tuple([0]) + + def __len__(self): + return 0 class NonConsolidatableMixIn(object): @@ -1378,6 +1470,8 @@ def putmask(self, mask, new, 
align=True, inplace=False, axis=0, if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] + + mask = mask.reshape(new_values.shape) new_values[mask] = new new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] @@ -1676,6 +1770,7 @@ def convert(self, *args, **kwargs): can return multiple blocks! """ + if args: raise NotImplementedError by_item = True if 'by_item' not in kwargs else kwargs['by_item'] @@ -1706,8 +1801,13 @@ def convert(self, *args, **kwargs): for i, rl in enumerate(self.mgr_locs): values = self.iget(i) - values = fn(values.ravel(), **fn_kwargs).reshape(values.shape) - values = _block_shape(values, ndim=self.ndim) + shape = values.shape + values = fn(values.ravel(), **fn_kwargs) + try: + values = values.reshape(shape) + values = _block_shape(values, ndim=self.ndim) + except AttributeError: + pass newb = make_block(values, ndim=self.ndim, placement=[rl]) blocks.append(newb) @@ -2115,7 +2215,10 @@ def _try_coerce_result(self, result): """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): if result.dtype.kind in ['i', 'f', 'O']: - result = result.astype('M8[ns]') + try: + result = result.astype('M8[ns]') + except ValueError: + pass elif isinstance(result, (np.integer, np.float, np.datetime64)): result = self._box_func(result) return result @@ -2219,11 +2322,6 @@ def to_object_block(self, mgr): kwargs['placement'] = [0] return self.make_block(values, klass=ObjectBlock, **kwargs) - def replace(self, *args, **kwargs): - # if we are forced to ObjectBlock, then don't coerce (to UTC) - kwargs['convert'] = False - return super(DatetimeTZBlock, self).replace(*args, **kwargs) - def _slice(self, slicer): """ return a slice of my values """ if isinstance(slicer, tuple): @@ -2246,8 +2344,8 @@ def _try_coerce_args(self, values, other): ------- base-type values, values mask, base-type other, other mask """ - values_mask = isnull(values) - values = values.tz_localize(None).asi8 + values_mask = _block_shape(isnull(values), ndim=self.ndim) + values = _block_shape(values.tz_localize(None).asi8, ndim=self.ndim) other_mask = False if isinstance(other, ABCSeries): @@ -2283,6 +2381,9 @@ def _try_coerce_result(self, result): elif isinstance(result, (np.integer, np.float, np.datetime64)): result = lib.Timestamp(result).tz_localize(self.values.tz) if isinstance(result, np.ndarray): + # allow passing of > 1dim if its trivial + if result.ndim > 1: + result = result.reshape(len(result)) result = self._holder(result).tz_localize(self.values.tz) return result @@ -2809,7 +2910,7 @@ def _verify_integrity(self): len(self.items), tot_items)) def apply(self, f, axes=None, filter=None, do_integrity_check=False, - consolidate=True, raw=False, **kwargs): + consolidate=True, **kwargs): """ iterate over the blocks, collect and create a new block manager @@ -2823,7 +2924,6 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, integrity check consolidate: boolean, default True. Join together blocks having same dtype - raw: boolean, default False. 
Return the raw returned results Returns ------- @@ -2890,17 +2990,102 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, applied = getattr(b, f)(**kwargs) result_blocks = _extend_blocks(applied, result_blocks) - if raw: - if self._is_single_block: - return result_blocks[0] - return result_blocks - elif len(result_blocks) == 0: + if len(result_blocks) == 0: return self.make_empty(axes or self.axes) bm = self.__class__(result_blocks, axes or self.axes, do_integrity_check=do_integrity_check) bm._consolidate_inplace() return bm + def reduction(self, f, axis=0, consolidate=True, transposed=False, + **kwargs): + """ + iterate over the blocks, collect and create a new block manager. + This routine is intended for reduction type operations and + will do inference on the generated blocks. + + Parameters + ---------- + f: the callable or function name to operate on at the block level + axis: reduction axis, default 0 + consolidate: boolean, default True. Join together blocks having same + dtype + transposed: boolean, default False + we are holding transposed data + + Returns + ------- + Block Manager (new object) + + """ + + if consolidate: + self._consolidate_inplace() + + axes, blocks = [], [] + for b in self.blocks: + kwargs['mgr'] = self + axe, block = getattr(b, f)(axis=axis, **kwargs) + + axes.append(axe) + blocks.append(block) + + # note that some DatetimeTZ, Categorical are always ndim==1 + ndim = set([b.ndim for b in blocks]) + + if 2 in ndim: + + new_axes = list(self.axes) + + # multiple blocks that are reduced + if len(blocks) > 1: + new_axes[1] = axes[0] + + # reset the placement to the original + for b, sb in zip(blocks, self.blocks): + b.mgr_locs = sb.mgr_locs + + else: + new_axes[axis] = Index(np.concatenate( + [ax.values for ax in axes])) + + if transposed: + new_axes = new_axes[::-1] + blocks = [b.make_block(b.values.T, + placement=np.arange(b.shape[1]) + ) for b in blocks] + + return self.__class__(blocks, new_axes) + + # 0 ndim + if 0 in ndim and 1 not in ndim: + values = np.array([b.values for b in blocks]) + if len(values) == 1: + return values.item() + blocks = [make_block(values, ndim=1)] + axes = Index([ax[0] for ax in axes]) + + # single block + values = _concat._concat_compat([b.values for b in blocks]) + + # compute the orderings of our original data + if len(self.blocks) > 1: + + indexer = np.empty(len(self.axes[0]), dtype='int64') + i = 0 + for b in self.blocks: + for j in b.mgr_locs: + indexer[j] = i + i = i + 1 + + values = values.take(indexer) + + return SingleBlockManager( + [make_block(values, + ndim=1, + placement=np.arange(len(values)))], + axes[0]) + def isnull(self, **kwargs): return self.apply('apply', **kwargs) @@ -2911,7 +3096,7 @@ def eval(self, **kwargs): return self.apply('eval', **kwargs) def quantile(self, **kwargs): - return self.apply('quantile', raw=True, **kwargs) + return self.reduction('quantile', **kwargs) def setitem(self, **kwargs): return self.apply('setitem', **kwargs) @@ -3068,7 +3253,6 @@ def combine(self, blocks, copy=True): indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) - new_items = self.items.take(indexer) new_blocks = [] for b in blocks: @@ -3077,9 +3261,10 @@ def combine(self, blocks, copy=True): axis=0, allow_fill=False) new_blocks.append(b) - new_axes = list(self.axes) - new_axes[0] = new_items - return self.__class__(new_blocks, new_axes, do_integrity_check=False) + axes = list(self.axes) + axes[0] = self.items.take(indexer) 
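An aside on the `reduction` machinery above: it is internal plumbing, and the public entry point now routed through it is `DataFrame.quantile` (rewired in the frame.py hunk earlier). A quick sketch of the result shapes, using only the public API:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(5, 3), columns=list('ABC'))

s = df.quantile(0.5)                 # scalar q -> Series indexed by column, name 0.5
q = df.quantile([0.1, 0.9])          # list q   -> DataFrame indexed by the quantiles
r = df.quantile([0.1, 0.9], axis=1)  # row-wise -> computed via the transposed branch
print(s.name, list(q.index), r.shape)  # 0.5 [0.1, 0.9] (2, 5)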
+ + return self.__class__(new_blocks, axes, do_integrity_check=False) def get_slice(self, slobj, axis=0): if axis >= self.ndim: @@ -3829,6 +4014,16 @@ def _block(self): def _values(self): return self._block.values + @property + def _blknos(self): + """ compat with BlockManager """ + return None + + @property + def _blklocs(self): + """ compat with BlockManager """ + return None + def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): # if we are the same and don't copy, just return @@ -4317,7 +4512,7 @@ def _extend_blocks(result, blocks=None): def _block_shape(values, ndim=1, shape=None): """ guarantee the shape of the values to be at least 1 d """ - if values.ndim <= ndim: + if values.ndim < ndim: if shape is None: shape = values.shape values = values.reshape(tuple((1, ) + shape)) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 63fea71895da2..f27a83f50e115 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -19,6 +19,7 @@ from pandas.tslib import iNaT from pandas.compat import bind_method import pandas.core.missing as missing +import pandas.algos as _algos import pandas.core.algorithms as algos from pandas.core.common import (is_list_like, notnull, isnull, _values_from_object, _maybe_match_name, @@ -421,7 +422,7 @@ def _convert_to_array(self, values, name=None, other=None): values = tslib.array_to_datetime(values) elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here - values = to_timedelta(values, errors='coerce') + values = to_timedelta(values, errors='coerce', box=False) elif inferred_type == 'integer': # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == 'm': @@ -503,9 +504,9 @@ def _offset(lvalues, rvalues): # convert Tick DateOffset to underlying delta if self.is_offset_lhs: - lvalues = to_timedelta(lvalues) + lvalues = to_timedelta(lvalues, box=False) if self.is_offset_rhs: - rvalues = to_timedelta(rvalues) + rvalues = to_timedelta(rvalues, box=False) lvalues = lvalues.astype(np.int64) if not self.is_floating_rhs: @@ -600,6 +601,21 @@ def na_op(x, y): result = missing.fill_zeros(result, x, y, name, fill_zeros) return result + def safe_na_op(lvalues, rvalues): + try: + return na_op(lvalues, rvalues) + except Exception: + if isinstance(rvalues, ABCSeries): + if is_object_dtype(rvalues): + # if dtype is object, try elementwise op + return _algos.arrmap_object(rvalues, + lambda x: op(lvalues, x)) + else: + if is_object_dtype(lvalues): + return _algos.arrmap_object(lvalues, + lambda x: op(x, rvalues)) + raise + def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, pd.DataFrame): @@ -638,9 +654,8 @@ def wrapper(left, right, name=name, na_op=na_op): if ridx is not None: rvalues = algos.take_1d(rvalues, ridx) - arr = na_op(lvalues, rvalues) - - return left._constructor(wrap_results(arr), index=index, + result = wrap_results(safe_na_op(lvalues, rvalues)) + return left._constructor(result, index=index, name=name, dtype=dtype) else: # scalars @@ -648,7 +663,8 @@ def wrapper(left, right, name=name, na_op=na_op): not isinstance(lvalues, pd.DatetimeIndex)): lvalues = lvalues.values - return left._constructor(wrap_results(na_op(lvalues, rvalues)), + result = wrap_results(safe_na_op(lvalues, rvalues)) + return left._constructor(result, index=left.index, name=left.name, dtype=dtype) @@ -738,7 +754,10 @@ def wrapper(self, other, axis=None): elif isinstance(other, pd.DataFrame): # pragma: no cover return NotImplemented elif isinstance(other, (np.ndarray, 
pd.Index)): - if len(self) != len(other): + # do not check length of zerodim array + # as it will broadcast + if (not lib.isscalar(lib.item_from_zerodim(other)) and + len(self) != len(other)): raise ValueError('Lengths must match to compare') return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 7e0c094aec4c2..8d237016d1b33 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -162,9 +162,12 @@ def get_result(self): # may need to coerce categoricals here if self.is_categorical is not None: - values = [Categorical.from_array( - values[:, i], categories=self.is_categorical.categories, - ordered=True) for i in range(values.shape[-1])] + categories = self.is_categorical.categories + ordered = self.is_categorical.ordered + values = [Categorical.from_array(values[:, i], + categories=categories, + ordered=ordered) + for i in range(values.shape[-1])] return DataFrame(values, index=index, columns=columns) diff --git a/pandas/core/series.py b/pandas/core/series.py index 58e983ad904ba..43b4ba3a51212 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -57,8 +57,6 @@ from pandas.core.config import get_option -from pandas import _np_version_under1p9 - __all__ = ['Series'] _shared_doc_kwargs = dict( @@ -1349,21 +1347,12 @@ def quantile(self, q=0.5, interpolation='linear'): self._check_percentile(q) - if _np_version_under1p9: - if interpolation != 'linear': - raise ValueError("Interpolation methods other than linear " - "are not supported in numpy < 1.9.") - - kwargs = dict() - if not _np_version_under1p9: - kwargs.update({'interpolation': interpolation}) + result = self._data.quantile(qs=q, interpolation=interpolation) - result = self._data.quantile(qs=q, **kwargs) - - if com.is_list_like(result): - # explicitly use Float64Index to coerce empty result to float dtype - index = Float64Index(q) - return self._constructor(result, index=index, name=self.name) + if com.is_list_like(q): + return self._constructor(result, + index=Float64Index(q), + name=self.name) else: # scalar return result diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 524c0205d7f73..5b1b8bd05af42 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -8,6 +8,7 @@ from pandas.core.algorithms import take_1d import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin +from pandas.types import api as gt from pandas.util.decorators import Appender, deprecate_kwarg import re import pandas.lib as lib @@ -148,12 +149,10 @@ def _na_map(f, arr, na_result=np.nan, dtype=object): def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): - from pandas.core.series import Series - if not len(arr): return np.ndarray(0, dtype=dtype) - if isinstance(arr, Series): + if isinstance(arr, gt.ABCSeries): arr = arr.values if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) @@ -687,33 +686,42 @@ def str_extractall(arr, pat, flags=0): C 0 NaN 1 """ - from pandas import DataFrame, MultiIndex + regex = re.compile(pat, flags=flags) # the regex must contain capture groups. 
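A few lines below, `str_extractall` gains an `Index` branch: an `Index` input is converted to a `Series` with a default integer index, and the result is built via `_constructor_expanddim`. A hedged usage sketch of that new path:

import pandas as pd

idx = pd.Index(['a1a2', 'b1', 'c1'])
result = idx.str.extractall(r'[ab](?P<digit>\d)')
# rows are keyed by (position, match); 'c1' has no [ab]-digit match and drops out
print(result)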
if regex.groups == 0: raise ValueError("pattern contains no capture groups") + + if isinstance(arr, gt.ABCIndex): + arr = arr.to_series().reset_index(drop=True) + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] match_list = [] index_list = [] + is_mi = arr.index.nlevels > 1 + for subject_key, subject in arr.iteritems(): if isinstance(subject, compat.string_types): - try: - key_list = list(subject_key) - except TypeError: - key_list = [subject_key] + + if not is_mi: + subject_key = (subject_key, ) + for match_i, match_tuple in enumerate(regex.findall(subject)): - na_tuple = [ - np.NaN if group == "" else group for group in match_tuple] + na_tuple = [np.NaN if group == "" else group + for group in match_tuple] match_list.append(na_tuple) - result_key = tuple(key_list + [match_i]) + result_key = tuple(subject_key + (match_i, )) index_list.append(result_key) + if 0 < len(index_list): + from pandas import MultiIndex index = MultiIndex.from_tuples( index_list, names=arr.index.names + ["match"]) else: index = None - result = DataFrame(match_list, index, columns) + result = arr._constructor_expanddim(match_list, index=index, + columns=columns) return result @@ -1804,9 +1812,9 @@ class StringAccessorMixin(object): # string methods def _make_str_accessor(self): - from pandas.core.series import Series from pandas.core.index import Index - if (isinstance(self, Series) and + + if (isinstance(self, gt.ABCSeries) and not ((is_categorical_dtype(self.dtype) and is_object_dtype(self.values.categories)) or (is_object_dtype(self.dtype)))): @@ -1819,6 +1827,8 @@ def _make_str_accessor(self): "values, which use np.object_ dtype in " "pandas") elif isinstance(self, Index): + # can't use ABCIndex to exclude non-str + # see src/inference.pyx which can contain string values allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') if self.inferred_type not in allowed_types: diff --git a/pandas/core/window.py b/pandas/core/window.py index b1be66bee9bc8..cd66d4e30c351 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -18,6 +18,7 @@ import pandas.core.common as com import pandas.algos as algos from pandas import compat +from pandas.compat.numpy import function as nv from pandas.util.decorators import Substitution, Appender from textwrap import dedent @@ -435,13 +436,15 @@ def aggregate(self, arg, *args, **kwargs): @Substitution(name='window') @Appender(_doc_template) @Appender(_shared_docs['sum']) - def sum(self, **kwargs): + def sum(self, *args, **kwargs): + nv.validate_window_func('sum', args, kwargs) return self._apply_window(mean=False, **kwargs) @Substitution(name='window') @Appender(_doc_template) @Appender(_shared_docs['mean']) - def mean(self, **kwargs): + def mean(self, *args, **kwargs): + nv.validate_window_func('mean', args, kwargs) return self._apply_window(mean=True, **kwargs) @@ -620,7 +623,8 @@ def f(arg, window, min_periods): return self._apply(f, func, args=args, kwargs=kwargs, center=False) - def sum(self, **kwargs): + def sum(self, *args, **kwargs): + nv.validate_window_func('sum', args, kwargs) return self._apply('roll_sum', 'sum', **kwargs) _shared_docs['max'] = dedent(""" @@ -631,7 +635,8 @@ def sum(self, **kwargs): how : string, default 'max' (DEPRECATED) Method for down- or re-sampling""") - def max(self, how=None, **kwargs): + def max(self, how=None, *args, **kwargs): + nv.validate_window_func('max', args, kwargs) if self.freq is not None and how is None: how = 'max' return 
self._apply('roll_max', 'max', how=how, **kwargs) @@ -644,12 +649,14 @@ def max(self, how=None, **kwargs): how : string, default 'min' (DEPRECATED) Method for down- or re-sampling""") - def min(self, how=None, **kwargs): + def min(self, how=None, *args, **kwargs): + nv.validate_window_func('min', args, kwargs) if self.freq is not None and how is None: how = 'min' return self._apply('roll_min', 'min', how=how, **kwargs) - def mean(self, **kwargs): + def mean(self, *args, **kwargs): + nv.validate_window_func('mean', args, kwargs) return self._apply('roll_mean', 'mean', **kwargs) _shared_docs['median'] = dedent(""" @@ -674,7 +681,8 @@ def median(self, how=None, **kwargs): Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements.""") - def std(self, ddof=1, **kwargs): + def std(self, ddof=1, *args, **kwargs): + nv.validate_window_func('std', args, kwargs) window = self._get_window() def f(arg, *args, **kwargs): @@ -693,7 +701,8 @@ def f(arg, *args, **kwargs): Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements.""") - def var(self, ddof=1, **kwargs): + def var(self, ddof=1, *args, **kwargs): + nv.validate_window_func('var', args, kwargs) return self._apply('roll_var', 'var', check_minp=_require_min_periods(1), ddof=ddof, **kwargs) @@ -865,26 +874,30 @@ def apply(self, func, args=(), kwargs={}): @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['sum']) - def sum(self, **kwargs): - return super(Rolling, self).sum(**kwargs) + def sum(self, *args, **kwargs): + nv.validate_rolling_func('sum', args, kwargs) + return super(Rolling, self).sum(*args, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['max']) - def max(self, **kwargs): - return super(Rolling, self).max(**kwargs) + def max(self, *args, **kwargs): + nv.validate_rolling_func('max', args, kwargs) + return super(Rolling, self).max(*args, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['min']) - def min(self, **kwargs): - return super(Rolling, self).min(**kwargs) + def min(self, *args, **kwargs): + nv.validate_rolling_func('min', args, kwargs) + return super(Rolling, self).min(*args, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['mean']) - def mean(self, **kwargs): - return super(Rolling, self).mean(**kwargs) + def mean(self, *args, **kwargs): + nv.validate_rolling_func('mean', args, kwargs) + return super(Rolling, self).mean(*args, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @@ -895,13 +908,15 @@ def median(self, **kwargs): @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['std']) - def std(self, ddof=1, **kwargs): + def std(self, ddof=1, *args, **kwargs): + nv.validate_rolling_func('std', args, kwargs) return super(Rolling, self).std(ddof=ddof, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['var']) - def var(self, ddof=1, **kwargs): + def var(self, ddof=1, *args, **kwargs): + nv.validate_rolling_func('var', args, kwargs) return super(Rolling, self).var(ddof=ddof, **kwargs) @Substitution(name='rolling') @@ -985,10 +1000,8 @@ class Expanding(_Rolling_and_Expanding): def __init__(self, obj, min_periods=1, freq=None, center=False, axis=0, **kwargs): - return super(Expanding, self).__init__(obj=obj, - min_periods=min_periods, - freq=freq, center=center, - axis=axis) + 
super(Expanding, self).__init__(obj=obj, min_periods=min_periods, + freq=freq, center=center, axis=axis) @property def _constructor(self): @@ -1025,26 +1038,30 @@ def apply(self, func, args=(), kwargs={}): @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['sum']) - def sum(self, **kwargs): - return super(Expanding, self).sum(**kwargs) + def sum(self, *args, **kwargs): + nv.validate_expanding_func('sum', args, kwargs) + return super(Expanding, self).sum(*args, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['max']) - def max(self, **kwargs): - return super(Expanding, self).max(**kwargs) + def max(self, *args, **kwargs): + nv.validate_expanding_func('max', args, kwargs) + return super(Expanding, self).max(*args, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['min']) - def min(self, **kwargs): - return super(Expanding, self).min(**kwargs) + def min(self, *args, **kwargs): + nv.validate_expanding_func('min', args, kwargs) + return super(Expanding, self).min(*args, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['mean']) - def mean(self, **kwargs): - return super(Expanding, self).mean(**kwargs) + def mean(self, *args, **kwargs): + nv.validate_expanding_func('mean', args, kwargs) + return super(Expanding, self).mean(*args, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @@ -1055,13 +1072,15 @@ def median(self, **kwargs): @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['std']) - def std(self, ddof=1, **kwargs): + def std(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func('std', args, kwargs) return super(Expanding, self).std(ddof=ddof, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['var']) - def var(self, ddof=1, **kwargs): + def var(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func('var', args, kwargs) return super(Expanding, self).var(ddof=ddof, **kwargs) @Substitution(name='expanding') @@ -1275,15 +1294,17 @@ def func(arg): @Substitution(name='ewm') @Appender(_doc_template) - def mean(self, **kwargs): + def mean(self, *args, **kwargs): """exponential weighted moving average""" + nv.validate_window_func('mean', args, kwargs) return self._apply('ewma', **kwargs) @Substitution(name='ewm') @Appender(_doc_template) @Appender(_bias_template) - def std(self, bias=False, **kwargs): + def std(self, bias=False, *args, **kwargs): """exponential weighted moving stddev""" + nv.validate_window_func('std', args, kwargs) return _zsqrt(self.var(bias=bias, **kwargs)) vol = std @@ -1291,8 +1312,9 @@ def std(self, bias=False, **kwargs): @Substitution(name='ewm') @Appender(_doc_template) @Appender(_bias_template) - def var(self, bias=False, **kwargs): + def var(self, bias=False, *args, **kwargs): """exponential weighted moving variance""" + nv.validate_window_func('var', args, kwargs) def f(arg): return algos.ewmcov(arg, arg, self.com, int(self.adjust), diff --git a/pandas/formats/format.py b/pandas/formats/format.py index c3ffc018d1031..27d8b553013b9 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -6,7 +6,7 @@ import sys from pandas.core.base import PandasObject -from pandas.core.common import isnull, notnull +from pandas.core.common import isnull, notnull, is_numeric_dtype from pandas.core.index import Index, MultiIndex, _ensure_index from pandas import compat from pandas.compat import (StringIO, lzip, range, map, zip, 
reduce, u, @@ -2260,6 +2260,68 @@ def _format_strings(self): return fmt_values +def format_percentiles(percentiles): + """ + Outputs rounded and formatted percentiles. + + Parameters + ---------- + percentiles : list-like, containing floats from interval [0,1] + + Returns + ------- + formatted : list of strings + + Notes + ----- + Rounding precision is chosen so that: (1) if any two elements of + ``percentiles`` differ, they remain different after rounding, and + (2) no entry is *rounded* to 0% or 100%. + Any non-integer is always rounded to at least 1 decimal place. + + Examples + -------- + Keeps all entries different after rounding: + + >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) + ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] + + No element is rounded to 0% or 100% (unless already equal to it). + Duplicates are allowed: + + >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) + ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] + """ + + percentiles = np.asarray(percentiles) + + # This check catches np.NaN as well + if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \ + or not np.all(percentiles <= 1): + raise ValueError("percentiles should all be in the interval [0,1]") + + percentiles = 100 * percentiles + int_idx = (percentiles.astype(int) == percentiles) + + if np.all(int_idx): + out = percentiles.astype(int).astype(str) + return [i + '%' for i in out] + + unique_pcts = np.unique(percentiles) + to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None + to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None + + # Least precision that keeps percentiles unique after rounding + prec = -np.floor(np.log10(np.min( + np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end) + ))).astype(int) + prec = max(1, prec) + out = np.empty_like(percentiles, dtype=object) + out[int_idx] = percentiles[int_idx].astype(int).astype(str) + out[~int_idx] = percentiles[~int_idx].round(prec).astype(str) + return [i + '%' for i in out] + + def _is_dates_only(values): # return a boolean if we are only dates (and don't have a timezone) values = DatetimeIndex(values) @@ -2590,6 +2652,9 @@ def __call__(self, num): import math dnum = decimal.Decimal(str(num)) + if decimal.Decimal.is_nan(dnum): + return 'NaN' + sign = 1 if dnum < 0: # pragma: no cover diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index dc178c1178c74..82f16becbd511 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -465,6 +465,24 @@ def repeat(self, n, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(n)) + def where(self, cond, other=None): + """ + .. versionadded:: 0.18.2 + + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. + + Parameters + ---------- + cond : boolean same length as self + other : scalar, or array-like + """ + if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + return self._shallow_copy_with_infer(values, dtype=self.dtype) + def ravel(self, order='C'): """ return an ndarray of the flattened values of the underlying data @@ -754,8 +772,28 @@ def _to_embed(self, keep_tz=False): """ return self.values.copy() - def astype(self, dtype): - return Index(self.values.astype(dtype), name=self.name, dtype=dtype) + _index_shared_docs['astype'] = """ + Create an Index with values cast to the specified dtype. The class of a new Index + is determined by dtype. 
When conversion is impossible, a ValueError + exception is raised. + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + .. versionadded:: 0.18.2 + + """ + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + return Index(self.values.astype(dtype, copy=copy), name=self.name, + dtype=dtype) def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 8f343c5de5fb6..e877e43bcc603 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -307,6 +307,29 @@ def _can_reindex(self, indexer): """ always allow reindexing """ pass + def where(self, cond, other=None): + """ + .. versionadded:: 0.18.2 + + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. + + Parameters + ---------- + cond : boolean same length as self + other : scalar, or array-like + """ + if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + + from pandas.core.categorical import Categorical + cat = Categorical(values, + categories=self.categories, + ordered=self.ordered) + return self._shallow_copy(cat, **self._get_attributes_dict()) + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 3effc9b1315e6..05b2045a4850f 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -592,7 +592,6 @@ def fillna(self, value=None, downcast=None): def get_value(self, series, key): # somewhat broken encapsulation from pandas.core.indexing import maybe_droplevels - from pandas.core.series import Series # Label-based s = _values_from_object(series) @@ -604,7 +603,8 @@ def _try_mi(k): new_values = series._values[loc] new_index = self[loc] new_index = maybe_droplevels(new_index, k) - return Series(new_values, index=new_index, name=series.name) + return series._constructor(new_values, index=new_index, + name=series.name).__finalize__(self) try: return self._engine.get_value(s, k) @@ -1084,6 +1084,10 @@ def repeat(self, n, *args, **kwargs): for label in self.labels], names=self.names, sortorder=self.sortorder, verify_integrity=False) + def where(self, cond, other=None): + raise NotImplementedError(".where is not supported for " + "MultiIndex operations") + def drop(self, labels, level=None, errors='raise'): """ Make new MultiIndex with passed list of labels deleted @@ -1761,7 +1765,8 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): else: m = np.zeros(len(labels), dtype=bool) - m[np.in1d(labels, r, assume_unique=True)] = True + m[np.in1d(labels, r, + assume_unique=Index(labels).is_unique)] = True return m @@ -2073,11 +2078,14 @@ def difference(self, other): return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): if not is_object_dtype(np.dtype(dtype)): raise TypeError('Setting %s dtype to anything other than object ' 'is not supported' % self.__class__) - return self._shallow_copy() + elif copy is True: + return self._shallow_copy() + return self def 
_convert_can_do_setop(self, other): result_names = self.names diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 983ea731b11ac..0deaf4da9b2bb 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -4,7 +4,7 @@ import pandas.index as _index from pandas import compat -from pandas.indexes.base import Index, InvalidIndexError +from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs from pandas.util.decorators import Appender, cache_readonly import pandas.core.common as com from pandas.core.common import (is_dtype_equal, isnull, pandas_dtype, @@ -238,12 +238,17 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, def inferred_type(self): return 'floating' - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) - if is_float_dtype(dtype) or is_integer_dtype(dtype): - values = self._values.astype(dtype) + if is_float_dtype(dtype): + values = self._values.astype(dtype, copy=copy) + elif is_integer_dtype(dtype): + if self.hasnans: + raise ValueError('cannot convert float NaN to integer') + values = self._values.astype(dtype, copy=copy) elif is_object_dtype(dtype): - values = self._values + values = self._values.astype('object', copy=copy) else: raise TypeError('Setting %s dtype to anything other than ' 'float64 or object is not supported' % diff --git a/pandas/io/common.py b/pandas/io/common.py index dc7c483c1fb68..cf4bba6e97afb 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -104,85 +104,6 @@ def __next__(self): BaseIterator.next = lambda self: self.__next__() -try: - from boto.s3 import key - - class BotoFileLikeReader(key.Key): - """boto Key modified to be more file-like - - This modification of the boto Key will read through a supplied - S3 key once, then stop. The unmodified boto Key object will repeatedly - cycle through a file in S3: after reaching the end of the file, - boto will close the file. Then the next call to `read` or `next` will - re-open the file and start reading from the beginning. - - Also adds a `readline` function which will split the returned - values by the `\n` character. - """ - - def __init__(self, *args, **kwargs): - encoding = kwargs.pop("encoding", None) # Python 2 compat - super(BotoFileLikeReader, self).__init__(*args, **kwargs) - # Add a flag to mark the end of the read. 
- self.finished_read = False - self.buffer = "" - self.lines = [] - if encoding is None and compat.PY3: - encoding = "utf-8" - self.encoding = encoding - self.lines = [] - - def next(self): - return self.readline() - - __next__ = next - - def read(self, *args, **kwargs): - if self.finished_read: - return b'' if compat.PY3 else '' - return super(BotoFileLikeReader, self).read(*args, **kwargs) - - def close(self, *args, **kwargs): - self.finished_read = True - return super(BotoFileLikeReader, self).close(*args, **kwargs) - - def seekable(self): - """Needed for reading by bz2""" - return False - - def readline(self): - """Split the contents of the Key by '\n' characters.""" - if self.lines: - retval = self.lines[0] - self.lines = self.lines[1:] - return retval - if self.finished_read: - if self.buffer: - retval, self.buffer = self.buffer, "" - return retval - else: - raise StopIteration - - if self.encoding: - self.buffer = "{}{}".format( - self.buffer, self.read(8192).decode(self.encoding)) - else: - self.buffer = "{}{}".format(self.buffer, self.read(8192)) - - split_buffer = self.buffer.split("\n") - self.lines.extend(split_buffer[:-1]) - self.buffer = split_buffer[-1] - - return self.readline() -except ImportError: - # boto is only needed for reading from S3. - pass -except TypeError: - # boto/boto3 issues - # GH11915 - pass - - def _is_url(url): """Check to see if a URL has a valid protocol. @@ -319,32 +240,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, return tuple(to_return) if _is_s3_url(filepath_or_buffer): - try: - import boto - except: - raise ImportError("boto is required to handle s3 files") - # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST - # are environment variables - parsed_url = parse_url(filepath_or_buffer) - s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com') - - try: - conn = boto.connect_s3(host=s3_host) - except boto.exception.NoAuthHandlerFound: - conn = boto.connect_s3(host=s3_host, anon=True) - - b = conn.get_bucket(parsed_url.netloc, validate=False) - if compat.PY2 and (compression == 'gzip' or - (compression == 'infer' and - filepath_or_buffer.endswith(".gz"))): - k = boto.s3.key.Key(b, parsed_url.path) - filepath_or_buffer = BytesIO(k.get_contents_as_string( - encoding=encoding)) - else: - k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding) - k.open('r') # Expose read errors immediately - filepath_or_buffer = k - return filepath_or_buffer, None, compression + from pandas.io.s3 import get_filepath_or_buffer + return get_filepath_or_buffer(filepath_or_buffer, + encoding=encoding, + compression=compression) # It is a pathlib.Path/py.path.local or string filepath_or_buffer = _stringify_path(filepath_or_buffer) diff --git a/pandas/io/html.py b/pandas/io/html.py index e350a40bfa805..48caaa39dd711 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -612,7 +612,8 @@ def _expand_elements(body): def _data_to_frame(data, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands): + parse_dates, tupleize_cols, thousands, + decimal): head, body, foot = data if head: @@ -630,7 +631,7 @@ def _data_to_frame(data, header, index_col, skiprows, tp = TextParser(body, header=header, index_col=index_col, skiprows=_get_skiprows(skiprows), parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands) + thousands=thousands, decimal=decimal) df = tp.read() return df @@ -716,7 +717,8 @@ def _validate_flavor(flavor): def _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, 
tupleize_cols, thousands, attrs, encoding): + parse_dates, tupleize_cols, thousands, attrs, encoding, + decimal): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -744,7 +746,9 @@ def _parse(flavor, io, match, header, index_col, skiprows, skiprows=skiprows, parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands)) + thousands=thousands, + decimal=decimal + )) except EmptyDataError: # empty table continue return ret @@ -752,7 +756,8 @@ def _parse(flavor, io, match, header, index_col, skiprows, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, - tupleize_cols=False, thousands=',', encoding=None): + tupleize_cols=False, thousands=',', encoding=None, + decimal='.'): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -828,6 +833,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, underlying parser library (e.g., the parser library will try to use the encoding provided by the document). + decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European + data). + + .. versionadded:: 0.18.2 + Returns ------- dfs : list of DataFrames @@ -871,4 +882,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, 'data (you passed a negative value)') _validate_header_arg(header) return _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding) + parse_dates, tupleize_cols, thousands, attrs, encoding, + decimal) diff --git a/pandas/io/json.py b/pandas/io/json.py index 08bfd8d7796a0..fd97e51208f7e 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -614,10 +614,12 @@ def nested_to_record(ds, prefix="", level=0): new_d = copy.deepcopy(d) for k, v in d.items(): # each key gets renamed with prefix + if not isinstance(k, compat.string_types): + k = str(k) if level == 0: - newkey = str(k) + newkey = k else: - newkey = prefix + '.' + str(k) + newkey = prefix + '.' + k # only dicts get recursively flattened # only at level>1 do we rename the rest of the keys diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f4527df56db88..a851a5f48f5e6 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -55,7 +55,7 @@ Alternative argument name for sep. delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\+s'``. If this option + used as the sep. Equivalent to setting ``sep='\s+'``. If this option is set to True, nothing should be passed in for the ``delimiter`` parameter. @@ -73,7 +73,8 @@ rather than the first line of the file. names : array-like, default None List of column names to use. If file contains no header row, then you - should explicitly pass header=None + should explicitly pass header=None. Duplicates in this list are not + allowed unless mangle_dupe_cols=True, which is the default. index_col : int or sequence or False, default None Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end @@ -91,7 +92,9 @@ prefix : str, default None Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... 
mangle_dupe_cols : boolean, default True - Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X' + Duplicate columns will be specified as 'X', 'X.1', ..., 'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (Unsupported with engine='python'). Use `str` or `object` to preserve and @@ -189,6 +192,10 @@ Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). Default (None) results in QUOTE_MINIMAL behavior. +doublequote : boolean, default ``True`` + When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive quotechar elements INSIDE a + field as a single ``quotechar`` element. escapechar : str (length 1), default None One-character string used to escape delimiter when quoting is QUOTE_NONE. comment : str, default None @@ -217,6 +224,32 @@ warn_bad_lines : boolean, default True If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. (Only valid with C parser). +low_memory : boolean, default True + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types, either set False or specify the type with the `dtype` parameter. + Note that the entire file is read into a single DataFrame regardless; + use the `chunksize` or `iterator` parameter to return the data in chunks. + (Only valid with C parser) +buffer_lines : int, default None + DEPRECATED: this argument will be removed in a future version because its + value is not respected by the parser + + If low_memory is True, specify the number of rows to be read for each + chunk. (Only valid with C parser) +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If compact_ints is True, then for any column that is of integer dtype, + the parser will attempt to cast it as the smallest integer dtype possible, + either signed or unsigned depending on the specification from the + `use_unsigned` parameter. +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. `compact_ints=True`), specify + whether the column should be compacted to the smallest signed or unsigned + integer dtype. Returns ------- @@ -269,6 +302,26 @@ """ % (_parser_params % (_fwf_widths, '')) +def _validate_nrows(nrows): + """ + Checks whether the 'nrows' parameter for parsing is either + an integer or a float that can safely be cast to an integer + without losing accuracy. Raises a ValueError if that is + not the case. + """ + msg = "'nrows' must be an integer" + + if nrows is not None: + if com.is_float(nrows): + if int(nrows) != nrows: + raise ValueError(msg) + nrows = int(nrows) + elif not com.is_integer(nrows): + raise ValueError(msg) + + return nrows + + def _read(filepath_or_buffer, kwds): "Generic reader of line files." encoding = kwds.get('encoding', None) @@ -308,14 +361,14 @@ def _read(filepath_or_buffer, kwds): # Extract some of the arguments (pass chunksize on). 
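A standalone sketch of what the `_validate_nrows` helper above accepts and rejects (illustrative re-implementation; the patch itself uses `com.is_float`/`com.is_integer` rather than `isinstance`):

def validate_nrows(nrows):
    # None passes through untouched; floats are allowed only when they
    # are exactly integral, and are coerced to int; everything else
    # (e.g. 1.5 or 'foo') raises ValueError.
    msg = "'nrows' must be an integer"
    if nrows is not None:
        if isinstance(nrows, float):
            if int(nrows) != nrows:
                raise ValueError(msg)
            nrows = int(nrows)
        elif not isinstance(nrows, int):
            raise ValueError(msg)
    return nrows

assert validate_nrows(None) is None
assert validate_nrows(5) == 5
assert validate_nrows(5.0) == 5   # integral float is coerced
# validate_nrows(1.5) and validate_nrows('foo') both raise ValueError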
iterator = kwds.get('iterator', False) - nrows = kwds.pop('nrows', None) chunksize = kwds.get('chunksize', None) + nrows = _validate_nrows(kwds.pop('nrows', None)) # Create the parser. parser = TextFileReader(filepath_or_buffer, **kwds) if (nrows is not None) and (chunksize is not None): - raise NotImplementedError("'nrows' and 'chunksize' can not be used" + raise NotImplementedError("'nrows' and 'chunksize' cannot be used" " together yet.") elif nrows is not None: return parser.read(nrows) @@ -348,6 +401,7 @@ def _read(filepath_or_buffer, kwds): 'keep_default_na': True, 'thousands': None, 'comment': None, + 'decimal': b'.', # 'engine': 'c', 'parse_dates': False, @@ -383,7 +437,6 @@ def _read(filepath_or_buffer, kwds): 'error_bad_lines': True, 'warn_bad_lines': True, 'dtype': None, - 'decimal': b'.', 'float_precision': None } @@ -395,18 +448,19 @@ def _read(filepath_or_buffer, kwds): _c_unsupported = set(['skip_footer']) _python_unsupported = set([ 'as_recarray', - 'na_filter', - 'compact_ints', - 'use_unsigned', 'low_memory', 'memory_map', 'buffer_lines', 'error_bad_lines', 'warn_bad_lines', 'dtype', - 'decimal', 'float_precision', ]) +_deprecated_args = set([ + 'buffer_lines', + 'compact_ints', + 'use_unsigned', +]) def _make_parser_function(name, sep=','): @@ -656,7 +710,14 @@ def _get_options_with_defaults(self, engine): options = {} for argname, default in compat.iteritems(_parser_defaults): - options[argname] = kwds.get(argname, default) + value = kwds.get(argname, default) + + # see gh-12935 + if argname == 'mangle_dupe_cols' and not value: + raise ValueError('Setting mangle_dupe_cols=False is ' + 'not supported yet') + else: + options[argname] = value for argname, default in compat.iteritems(_c_parser_defaults): if argname in kwds: @@ -754,6 +815,13 @@ def _clean_options(self, options, engine): _validate_header_arg(options['header']) + for arg in _deprecated_args: + parser_default = _c_parser_defaults[arg] + if result.get(arg, parser_default) != parser_default: + warnings.warn("The '{arg}' argument has been deprecated " + "and will be removed in a future version" + .format(arg=arg), FutureWarning, stacklevel=2) + if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") if _is_index_col(index_col): @@ -847,12 +915,13 @@ def _validate_usecols_arg(usecols): or strings (column by name). Raises a ValueError if that is not the case. """ + msg = ("The elements of 'usecols' must " + "either be all strings, all unicode, or all integers") + if usecols is not None: usecols_dtype = lib.infer_dtype(usecols) - if usecols_dtype not in ('integer', 'string'): - raise ValueError(("The elements of 'usecols' " - "must either be all strings " - "or all integers")) + if usecols_dtype not in ('integer', 'string', 'unicode'): + raise ValueError(msg) return usecols @@ -900,6 +969,7 @@ def __init__(self, kwds): self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') self.tupleize_cols = kwds.get('tupleize_cols', False) + self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.infer_datetime_format = kwds.pop('infer_datetime_format', False) self._date_conv = _make_date_converter( @@ -1013,6 +1083,26 @@ def tostr(x): return names, index_names, col_names, passed_names + def _maybe_dedup_names(self, names): + # see gh-7160 and gh-9424: this helps to provide + # immediate alleviation of the duplicate names + # issue and appears to be satisfactory to users, + # but ultimately, not needing to butcher the names + # would be nice! 
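The `_maybe_dedup_names` body that follows implements, in effect, this renaming scheme; a standalone sketch of the `mangle_dupe_cols=True` path:

def dedup_names(names):
    # The first occurrence keeps its name; later duplicates become
    # 'name.1', 'name.2', ... in order of appearance.
    names = list(names)
    counts = {}
    for i, col in enumerate(names):
        cur_count = counts.get(col, 0)
        if cur_count > 0:
            names[i] = '%s.%d' % (col, cur_count)
        counts[col] = cur_count + 1
    return names

assert dedup_names(['one', 'one']) == ['one', 'one.1']
assert dedup_names(['a', 'b', 'a', 'a']) == ['a', 'b', 'a.1', 'a.2']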
+ if self.mangle_dupe_cols: + names = list(names) # so we can index + counts = {} + + for i, col in enumerate(names): + cur_count = counts.get(col, 0) + + if cur_count > 0: + names[i] = '%s.%d' % (col, cur_count) + + counts[col] = cur_count + 1 + + return names + def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here if (not self.tupleize_cols and len(columns) and @@ -1131,8 +1221,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, result = {} for c, values in compat.iteritems(dct): conv_f = None if converters is None else converters.get(c, None) - col_na_values, col_na_fvalues = _get_na_values(c, na_values, - na_fvalues) + + if self.na_filter: + col_na_values, col_na_fvalues = _get_na_values( + c, na_values, na_fvalues) + else: + col_na_values, col_na_fvalues = set(), set() + coerce_type = True if conv_f is not None: try: @@ -1144,6 +1239,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, cvals, na_count = self._convert_types( values, set(col_na_values) | col_na_fvalues, coerce_type) + + if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: + cvals = lib.downcast_int64( + cvals, _parser.na_values, + self.use_unsigned) + result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) @@ -1315,10 +1416,11 @@ def read(self, nrows=None): except StopIteration: if self._first_chunk: self._first_chunk = False + names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - self.orig_names, self.index_col, - self.index_names, dtype=self.kwds.get('dtype')) + names, self.index_col, self.index_names, + dtype=self.kwds.get('dtype')) if self.usecols is not None: columns = self._filter_usecols(columns) @@ -1362,6 +1464,8 @@ def read(self, nrows=None): if self.usecols is not None: names = self._filter_usecols(names) + names = self._maybe_dedup_names(names) + # rename dict keys data = sorted(data.items()) data = dict((k, v) for k, (i, v) in zip(names, data)) @@ -1374,6 +1478,7 @@ def read(self, nrows=None): # ugh, mutation names = list(self.orig_names) + names = self._maybe_dedup_names(names) if self.usecols is not None: names = self._filter_usecols(names) @@ -1568,12 +1673,13 @@ def __init__(self, f, **kwds): self.skipinitialspace = kwds['skipinitialspace'] self.lineterminator = kwds['lineterminator'] self.quoting = kwds['quoting'] - self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.usecols = _validate_usecols_arg(kwds['usecols']) self.skip_blank_lines = kwds['skip_blank_lines'] self.names_passed = kwds['names'] or None + self.na_filter = kwds['na_filter'] + self.has_index_names = False if 'has_index_names' in kwds: self.has_index_names = kwds['has_index_names'] @@ -1581,7 +1687,11 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] + self.compact_ints = kwds['compact_ints'] + self.use_unsigned = kwds['use_unsigned'] self.thousands = kwds['thousands'] + self.decimal = kwds['decimal'] + self.comment = kwds['comment'] self._comment_lines = [] @@ -1639,6 +1749,15 @@ def __init__(self, f, **kwds): else: self._no_thousands_columns = None + if len(self.decimal) != 1: + raise ValueError('Only length-1 decimal markers supported') + + if self.thousands is None: + self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal) + else: + self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, + self.decimal)) + def _set_no_thousands_columns(self): # Create a set of 
column ids that are not to be stripped of thousands # operators. @@ -1747,8 +1866,8 @@ def read(self, rows=None): columns = list(self.orig_names) if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 - return _get_empty_meta(self.orig_names, - self.index_col, + names = self._maybe_dedup_names(self.orig_names) + return _get_empty_meta(names, self.index_col, self.index_names) # handle new style for names in index @@ -1761,7 +1880,8 @@ def read(self, rows=None): alldata = self._rows_to_cols(content) data = self._exclude_implicit_index(alldata) - columns, data = self._do_date_conversions(self.columns, data) + columns = self._maybe_dedup_names(self.columns) + columns, data = self._do_date_conversions(columns, data) data = self._convert_data(data) index, columns = self._make_index(data, alldata, columns, indexnamerow) @@ -1769,18 +1889,19 @@ def read(self, rows=None): return index, columns, data def _exclude_implicit_index(self, alldata): + names = self._maybe_dedup_names(self.orig_names) if self._implicit_index: excl_indices = self.index_col data = {} offset = 0 - for i, col in enumerate(self.orig_names): + for i, col in enumerate(names): while i + offset in excl_indices: offset += 1 data[col] = alldata[i + offset] else: - data = dict((k, v) for k, v in zip(self.orig_names, alldata)) + data = dict((k, v) for k, v in zip(names, alldata)) return data @@ -2050,22 +2171,35 @@ def _check_empty(self, lines): def _check_thousands(self, lines): if self.thousands is None: return lines - nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands) + + return self._search_replace_num_columns(lines=lines, + search=self.thousands, + replace='') + + def _search_replace_num_columns(self, lines, search, replace): ret = [] for l in lines: rl = [] for i, x in enumerate(l): if (not isinstance(x, compat.string_types) or - self.thousands not in x or + search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) or - nonnum.search(x.strip())): + self.nonnum.search(x.strip())): rl.append(x) else: - rl.append(x.replace(self.thousands, '')) + rl.append(x.replace(search, replace)) ret.append(rl) return ret + def _check_decimal(self, lines): + if self.decimal == _parser_defaults['decimal']: + return lines + + return self._search_replace_num_columns(lines=lines, + search=self.decimal, + replace='.') + def _clear_buffer(self): self.buf = [] @@ -2135,14 +2269,16 @@ def _get_index_name(self, columns): return index_name, orig_names, columns def _rows_to_cols(self, content): - zipped_content = list(lib.to_object_array(content).T) - col_len = self.num_original_columns - zip_len = len(zipped_content) if self._implicit_index: col_len += len(self.index_col) + # see gh-13320 + zipped_content = list(lib.to_object_array( + content, min_width=col_len).T) + zip_len = len(zipped_content) + if self.skip_footer < 0: raise ValueError('skip footer cannot be negative') @@ -2249,7 +2385,8 @@ def _get_lines(self, rows=None): lines = self._check_comments(lines) if self.skip_blank_lines: lines = self._check_empty(lines) - return self._check_thousands(lines) + lines = self._check_thousands(lines) + return self._check_decimal(lines) def _make_date_converter(date_parser=None, dayfirst=False, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index dff2c6f0df7b1..cbe04349b5105 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -13,10 +13,12 @@ import os import numpy as np + import pandas as pd from pandas import (Series, DataFrame, Panel, Panel4D, Index, 
MultiIndex, Int64Index) from pandas.core import config +from pandas.io.common import _stringify_path from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex @@ -254,6 +256,7 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, else: f = lambda store: store.put(key, value, **kwargs) + path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, string_types): with HDFStore(path_or_buf, mode=mode, complevel=complevel, complib=complib) as store: @@ -270,7 +273,11 @@ def read_hdf(path_or_buf, key=None, **kwargs): Parameters ---------- - path_or_buf : path (string), or buffer to read from + path_or_buf : path (string), buffer, or path object (pathlib.Path or + py._path.local.LocalPath) to read from + + .. versionadded:: 0.18.2 support for pathlib, py.path. + key : group identifier in the store. Can be omitted if the HDF file contains a single pandas object. where : list of Term (or convertible) objects, optional @@ -293,6 +300,7 @@ def read_hdf(path_or_buf, key=None, **kwargs): if 'where' in kwargs: kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1) + path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, string_types): try: @@ -316,17 +324,27 @@ def read_hdf(path_or_buf, key=None, **kwargs): store = path_or_buf auto_close = False + else: raise NotImplementedError('Support for generic buffers has not been ' 'implemented.') try: if key is None: - keys = store.keys() - if len(keys) != 1: - raise ValueError('key must be provided when HDF file contains ' - 'multiple datasets.') - key = keys[0] + groups = store.groups() + if len(groups) == 0: + raise ValueError('No dataset in HDF5 file.') + candidate_only_group = groups[0] + + # For the HDF file to have only one dataset, all other groups + # should then be metadata groups for that candidate group. (This + # assumes that the groups() method enumerates parent groups + # before their children.) 
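In usage terms, the group walk below lets `read_hdf` keep inferring the key when a file holds a single dataset plus auxiliary metadata groups; a hedged sketch (file name hypothetical, assumes PyTables is installed):

import pandas as pd

df = pd.DataFrame({'a': pd.Categorical(['x', 'y', 'x'])})
df.to_hdf('single.h5', 'df', format='table')

# The categorical column stores its categories under extra 'meta'
# subgroups, so the file contains several HDF5 groups but only one
# real dataset; the key can nevertheless be omitted:
result = pd.read_hdf('single.h5')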
+ for group_to_check in groups[1:]: + if not _is_metadata_of(group_to_check, candidate_only_group): + raise ValueError('key must be provided when HDF5 file ' + 'contains multiple datasets.') + key = candidate_only_group._v_pathname return store.select(key, auto_close=auto_close, **kwargs) except: # if there is an error, close the store @@ -338,6 +356,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): raise +def _is_metadata_of(group, parent_group): + """Check if a given group is a metadata group for a given parent_group.""" + if group._v_depth <= parent_group._v_depth: + return False + + current = group + while current._v_depth > 1: + parent = current._v_parent + if parent == parent_group and current._v_name == 'meta': + return True + current = current._v_parent + return False + + class HDFStore(StringMixin): """ @@ -1305,12 +1337,20 @@ def __init__(self, store, s, func, where, nrows, start=None, stop=None, self.s = s self.func = func self.where = where - self.nrows = nrows or 0 - self.start = start or 0 - if stop is None: - stop = self.nrows - self.stop = min(self.nrows, stop) + # if we are a table, set start/stop if they are not already set + if self.s.is_table: + if nrows is None: + nrows = 0 + if start is None: + start = 0 + if stop is None: + stop = nrows + stop = min(nrows, stop) + + self.nrows = nrows + self.start = start + self.stop = stop self.coordinates = None if iterator or chunksize is not None: @@ -2294,14 +2334,23 @@ def f(values, freq=None, tz=None): return klass def validate_read(self, kwargs): - if kwargs.get('columns') is not None: + """ + remove table keywords from kwargs and return them; + raise if any keywords are passed which are not None + """ + kwargs = copy.copy(kwargs) + + columns = kwargs.pop('columns', None) + if columns is not None: raise TypeError("cannot pass a column specification when reading " "a Fixed format store. This store must be " "selected in its entirety") - if kwargs.get('where') is not None: + where = kwargs.pop('where', None) + if where is not None: raise TypeError("cannot pass a where specification when reading " "from a Fixed format store. 
This store must be " "selected in its entirety") + return kwargs @property def is_exists(self): @@ -2320,11 +2369,11 @@ def get_attrs(self): def write(self, obj, **kwargs): self.set_attrs() - def read_array(self, key): + def read_array(self, key, start=None, stop=None): """ read an array for the specified node (off of the group) """ import tables node = getattr(self.group, key) - data = node[:] + data = node[start:stop] attrs = node._v_attrs transposed = getattr(attrs, 'transposed', False) @@ -2354,17 +2403,17 @@ def read_array(self, key): else: return ret - def read_index(self, key): + def read_index(self, key, **kwargs): variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key)) if variety == u('multi'): - return self.read_multi_index(key) + return self.read_multi_index(key, **kwargs) elif variety == u('block'): - return self.read_block_index(key) + return self.read_block_index(key, **kwargs) elif variety == u('sparseint'): - return self.read_sparse_intindex(key) + return self.read_sparse_intindex(key, **kwargs) elif variety == u('regular'): - _, index = self.read_index_node(getattr(self.group, key)) + _, index = self.read_index_node(getattr(self.group, key), **kwargs) return index else: # pragma: no cover raise TypeError('unrecognized index variety: %s' % variety) @@ -2402,19 +2451,19 @@ def write_block_index(self, key, index): self.write_array('%s_blengths' % key, index.blengths) setattr(self.attrs, '%s_length' % key, index.length) - def read_block_index(self, key): + def read_block_index(self, key, **kwargs): length = getattr(self.attrs, '%s_length' % key) - blocs = self.read_array('%s_blocs' % key) - blengths = self.read_array('%s_blengths' % key) + blocs = self.read_array('%s_blocs' % key, **kwargs) + blengths = self.read_array('%s_blengths' % key, **kwargs) return BlockIndex(length, blocs, blengths) def write_sparse_intindex(self, key, index): self.write_array('%s_indices' % key, index.indices) setattr(self.attrs, '%s_length' % key, index.length) - def read_sparse_intindex(self, key): + def read_sparse_intindex(self, key, **kwargs): length = getattr(self.attrs, '%s_length' % key) - indices = self.read_array('%s_indices' % key) + indices = self.read_array('%s_indices' % key, **kwargs) return IntIndex(length, indices) def write_multi_index(self, key, index): @@ -2439,7 +2488,7 @@ def write_multi_index(self, key, index): label_key = '%s_label%d' % (key, i) self.write_array(label_key, lab) - def read_multi_index(self, key): + def read_multi_index(self, key, **kwargs): nlevels = getattr(self.attrs, '%s_nlevels' % key) levels = [] @@ -2447,19 +2496,20 @@ def read_multi_index(self, key, **kwargs): names = [] for i in range(nlevels): level_key = '%s_level%d' % (key, i) - name, lev = self.read_index_node(getattr(self.group, level_key)) + name, lev = self.read_index_node(getattr(self.group, level_key), + **kwargs) levels.append(lev) names.append(name) label_key = '%s_label%d' % (key, i) - lab = self.read_array(label_key) + lab = self.read_array(label_key, **kwargs) labels.append(lab) return MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=True) - def read_index_node(self, node): - data = node[:] + def read_index_node(self, node, start=None, stop=None): + data = node[start:stop] # If the index was an empty array write_array_empty() will # have written a sentinel. Here we replace it with the original. 
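With `read_array` and `read_index_node` above accepting start/stop, row-sliced reads reach fixed-format stores as well; a hedged usage sketch (file name hypothetical):

import pandas as pd

s = pd.Series(range(10))
s.to_hdf('fixed.h5', 'series', format='fixed')

# Only rows [2, 5) are materialized rather than the whole node:
partial = pd.read_hdf('fixed.h5', 'series', start=2, stop=5)
# partial should equal s.iloc[2:5]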
if ('shape' in node._v_attrs and @@ -2598,9 +2648,9 @@ def write_array(self, key, value, items=None): class LegacyFixed(GenericFixed): - def read_index_legacy(self, key): + def read_index_legacy(self, key, start=None, stop=None): node = getattr(self.group, key) - data = node[:] + data = node[start:stop] kind = node._v_attrs.kind return _unconvert_index_legacy(data, kind, encoding=self.encoding) @@ -2608,7 +2658,7 @@ def read_index_legacy(self, key): class LegacySeriesFixed(LegacyFixed): def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) index = self.read_index_legacy('index') values = self.read_array('values') return Series(values, index=index) @@ -2617,7 +2667,7 @@ def read(self, **kwargs): class LegacyFrameFixed(LegacyFixed): def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) index = self.read_index_legacy('index') columns = self.read_index_legacy('columns') values = self.read_array('values') @@ -2636,9 +2686,9 @@ def shape(self): return None def read(self, **kwargs): - self.validate_read(kwargs) - index = self.read_index('index') - values = self.read_array('values') + kwargs = self.validate_read(kwargs) + index = self.read_index('index', **kwargs) + values = self.read_array('values', **kwargs) return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): @@ -2648,12 +2698,25 @@ def write(self, obj, **kwargs): self.attrs.name = obj.name -class SparseSeriesFixed(GenericFixed): +class SparseFixed(GenericFixed): + + def validate_read(self, kwargs): + """ + we don't support start, stop kwds in Sparse + """ + kwargs = super(SparseFixed, self).validate_read(kwargs) + if 'start' in kwargs or 'stop' in kwargs: + raise NotImplementedError("start and/or stop are not supported " + "in fixed Sparse reading") + return kwargs + + +class SparseSeriesFixed(SparseFixed): pandas_kind = u('sparse_series') attributes = ['name', 'fill_value', 'kind'] def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) index = self.read_index('index') sp_values = self.read_array('sp_values') sp_index = self.read_index('sp_index') @@ -2672,12 +2735,12 @@ def write(self, obj, **kwargs): self.attrs.kind = obj.kind -class SparseFrameFixed(GenericFixed): +class SparseFrameFixed(SparseFixed): pandas_kind = u('sparse_frame') attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) columns = self.read_index('columns') sdict = {} for c in columns: @@ -2705,12 +2768,12 @@ def write(self, obj, **kwargs): self.write_index('columns', obj.columns) -class SparsePanelFixed(GenericFixed): +class SparsePanelFixed(SparseFixed): pandas_kind = u('sparse_panel') attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) items = self.read_index('items') sdict = {} @@ -2773,19 +2836,26 @@ def shape(self): except: return None - def read(self, **kwargs): - self.validate_read(kwargs) + def read(self, start=None, stop=None, **kwargs): + # start, stop applied to rows, so 0th axis only + + kwargs = self.validate_read(kwargs) + select_axis = self.obj_type()._get_block_manager_axis(0) axes = [] for i in range(self.ndim): - ax = self.read_index('axis%d' % i) + + _start, _stop = (start, stop) if i == select_axis else (None, None) + ax = self.read_index('axis%d' % i, start=_start, stop=_stop) axes.append(ax) items = axes[0] blocks = [] for i in 
range(self.nblocks): + blk_items = self.read_index('block%d_items' % i) - values = self.read_array('block%d_values' % i) + values = self.read_array('block%d_values' % i, + start=_start, stop=_stop) blk = make_block(values, placement=items.get_indexer(blk_items)) blocks.append(blk) @@ -3826,24 +3896,24 @@ def write_data(self, chunksize, dropna=False): nrows = self.nrows_expected # if dropna==True, then drop ALL nan rows + masks = [] if dropna: - masks = [] for a in self.values_axes: # figure the mask: only do if we can successfully process this # column, otherwise ignore the mask mask = com.isnull(a.data).all(axis=0) - masks.append(mask.astype('u1', copy=False)) + if isinstance(mask, np.ndarray): + masks.append(mask.astype('u1', copy=False)) - # consolidate masks + # consolidate masks + if len(masks): mask = masks[0] for m in masks[1:]: mask = mask & m mask = mask.ravel() - else: - mask = None # broadcast the indexes if needed diff --git a/pandas/io/s3.py b/pandas/io/s3.py new file mode 100644 index 0000000000000..df8f1d9187031 --- /dev/null +++ b/pandas/io/s3.py @@ -0,0 +1,112 @@ +""" s3 support for remote file interactivity """ + +import os +from pandas import compat +from pandas.compat import BytesIO + +try: + import boto + from boto.s3 import key +except: + raise ImportError("boto is required to handle s3 files") + +if compat.PY3: + from urllib.parse import urlparse as parse_url +else: + from urlparse import urlparse as parse_url + + +class BotoFileLikeReader(key.Key): + """boto Key modified to be more file-like + + This modification of the boto Key will read through a supplied + S3 key once, then stop. The unmodified boto Key object will repeatedly + cycle through a file in S3: after reaching the end of the file, + boto will close the file. Then the next call to `read` or `next` will + re-open the file and start reading from the beginning. + + Also adds a `readline` function which will split the returned + values by the `\n` character. + """ + + def __init__(self, *args, **kwargs): + encoding = kwargs.pop("encoding", None) # Python 2 compat + super(BotoFileLikeReader, self).__init__(*args, **kwargs) + # Add a flag to mark the end of the read. 
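The net effect of moving the boto shim into the new pandas.io.s3 module is that boto is imported lazily, only when an s3:// path is actually read; a hedged usage sketch (bucket and key hypothetical; requires the boto package plus credentials or an anonymously readable bucket):

import pandas as pd

# Dispatches through pandas.io.s3.get_filepath_or_buffer; the
# ImportError("boto is required to handle s3 files") now fires only
# when an s3 URL is used without boto installed, not at import time.
df = pd.read_csv('s3://some-bucket/path/to/data.csv')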
+ self.finished_read = False + self.buffer = "" + self.lines = [] + if encoding is None and compat.PY3: + encoding = "utf-8" + self.encoding = encoding + self.lines = [] + + def next(self): + return self.readline() + + __next__ = next + + def read(self, *args, **kwargs): + if self.finished_read: + return b'' if compat.PY3 else '' + return super(BotoFileLikeReader, self).read(*args, **kwargs) + + def close(self, *args, **kwargs): + self.finished_read = True + return super(BotoFileLikeReader, self).close(*args, **kwargs) + + def seekable(self): + """Needed for reading by bz2""" + return False + + def readline(self): + """Split the contents of the Key by '\n' characters.""" + if self.lines: + retval = self.lines[0] + self.lines = self.lines[1:] + return retval + if self.finished_read: + if self.buffer: + retval, self.buffer = self.buffer, "" + return retval + else: + raise StopIteration + + if self.encoding: + self.buffer = "{}{}".format( + self.buffer, self.read(8192).decode(self.encoding)) + else: + self.buffer = "{}{}".format(self.buffer, self.read(8192)) + + split_buffer = self.buffer.split("\n") + self.lines.extend(split_buffer[:-1]) + self.buffer = split_buffer[-1] + + return self.readline() + + +def get_filepath_or_buffer(filepath_or_buffer, encoding=None, + compression=None): + + # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST + # are environment variables + parsed_url = parse_url(filepath_or_buffer) + s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com') + + try: + conn = boto.connect_s3(host=s3_host) + except boto.exception.NoAuthHandlerFound: + conn = boto.connect_s3(host=s3_host, anon=True) + + b = conn.get_bucket(parsed_url.netloc, validate=False) + if compat.PY2 and (compression == 'gzip' or + (compression == 'infer' and + filepath_or_buffer.endswith(".gz"))): + k = boto.s3.key.Key(b, parsed_url.path) + filepath_or_buffer = BytesIO(k.get_contents_as_string( + encoding=encoding)) + else: + k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding) + k.open('r') # Expose read errors immediately + filepath_or_buffer = k + return filepath_or_buffer, None, compression diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6c6e11a53d2d3..ae7200cf6fb2e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -89,12 +89,14 @@ Examples -------- Read a Stata dta file: ->> df = pandas.read_stata('filename.dta') + +>>> df = pandas.read_stata('filename.dta') Read a Stata dta file in 10,000 line chunks: ->> itr = pandas.read_stata('filename.dta', chunksize=10000) ->> for chunk in itr: ->> do_something(chunk) + +>>> itr = pandas.read_stata('filename.dta', chunksize=10000) +>>> for chunk in itr: +>>> do_something(chunk) """ % (_statafile_processing_params1, _encoding_params, _statafile_processing_params2, _chunksize_params, _iterator_params) diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index 81a1fecbdebac..4848db97194d9 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -2,8 +2,10 @@ from pandas import DataFrame import numpy as np +import json import pandas.util.testing as tm +from pandas import compat from pandas.io.json import json_normalize, nested_to_record @@ -164,6 +166,26 @@ def test_record_prefix(self): tm.assert_frame_equal(result, expected) + def test_non_ascii_key(self): + if compat.PY3: + testjson = ( + b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' + + b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]' + ).decode('utf8') + else: + testjson = 
('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' + '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]') + + testdata = { + u'sub.A': [1, 3], + u'sub.B': [2, 4], + b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1] + } + expected = DataFrame(testdata) + + result = json_normalize(json.loads(testjson)) + tm.assert_frame_equal(result, expected) + class TestNestedToRecord(tm.TestCase): diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 6fe559e5cacd8..9f8aedc2e399e 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -87,7 +87,7 @@ def test_frame_double_encoded_labels(self): orient='index')) df_unser = read_json(df.to_json(orient='records'), orient='records') assert_index_equal(df.columns, df_unser.columns) - np.testing.assert_equal(df.values, df_unser.values) + tm.assert_numpy_array_equal(df.values, df_unser.values) def test_frame_non_unique_index(self): df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 1], @@ -99,10 +99,10 @@ def test_frame_non_unique_index(self): assert_frame_equal(df, read_json(df.to_json(orient='split'), orient='split')) unser = read_json(df.to_json(orient='records'), orient='records') - self.assertTrue(df.columns.equals(unser.columns)) - np.testing.assert_equal(df.values, unser.values) + self.assert_index_equal(df.columns, unser.columns) + tm.assert_almost_equal(df.values, unser.values) unser = read_json(df.to_json(orient='values'), orient='values') - np.testing.assert_equal(df.values, unser.values) + tm.assert_numpy_array_equal(df.values, unser.values) def test_frame_non_unique_columns(self): df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2], @@ -115,7 +115,7 @@ def test_frame_non_unique_columns(self): assert_frame_equal(df, read_json(df.to_json(orient='split'), orient='split', dtype=False)) unser = read_json(df.to_json(orient='values'), orient='values') - np.testing.assert_equal(df.values, unser.values) + tm.assert_numpy_array_equal(df.values, unser.values) # GH4377; duplicate columns not processing correctly df = DataFrame([['a', 'b'], ['c', 'd']], index=[ @@ -183,7 +183,8 @@ def _check_orient(df, orient, dtype=None, numpy=False, # index is not captured in this orientation assert_almost_equal(df.values, unser.values, check_dtype=check_numpy_dtype) - self.assertTrue(df.columns.equals(unser.columns)) + self.assert_index_equal(df.columns, unser.columns, + exact=check_column_type) elif orient == "values": # index and cols are not captured in this orientation if numpy is True and df.shape == (0, 0): @@ -302,12 +303,10 @@ def _check_all_orients(df, dtype=None, convert_axes=True, # mixed data index = pd.Index(['a', 'b', 'c', 'd', 'e']) - data = { - 'A': [0., 1., 2., 3., 4.], - 'B': [0., 1., 0., 1., 0.], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': [True, False, True, False, True] - } + data = {'A': [0., 1., 2., 3., 4.], + 'B': [0., 1., 0., 1., 0.], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': [True, False, True, False, True]} df = DataFrame(data=data, index=index) _check_orient(df, "split", check_dtype=False) _check_orient(df, "records", check_dtype=False) @@ -487,7 +486,7 @@ def test_series_non_unique_index(self): orient='split', typ='series')) unser = read_json(s.to_json(orient='records'), orient='records', typ='series') - np.testing.assert_equal(s.values, unser.values) + tm.assert_numpy_array_equal(s.values, unser.values) def test_series_from_json_to_json(self): diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/io/tests/json/test_ujson.py index 
babcd910a2edd..13b2dafec9c89 100644 --- a/pandas/io/tests/json/test_ujson.py +++ b/pandas/io/tests/json/test_ujson.py @@ -21,8 +21,6 @@ import pandas.compat as compat import numpy as np -from numpy.testing import (assert_array_almost_equal_nulp, - assert_approx_equal) from pandas import DataFrame, Series, Index, NaT, DatetimeIndex import pandas.util.testing as tm @@ -1015,19 +1013,19 @@ def testFloatArray(self): inpt = arr.astype(dtype) outp = np.array(ujson.decode(ujson.encode( inpt, double_precision=15)), dtype=dtype) - assert_array_almost_equal_nulp(inpt, outp) + tm.assert_almost_equal(inpt, outp) def testFloatMax(self): num = np.float(np.finfo(np.float).max / 10) - assert_approx_equal(np.float(ujson.decode( + tm.assert_almost_equal(np.float(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) num = np.float32(np.finfo(np.float32).max / 10) - assert_approx_equal(np.float32(ujson.decode( + tm.assert_almost_equal(np.float32(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) num = np.float64(np.finfo(np.float64).max / 10) - assert_approx_equal(np.float64(ujson.decode( + tm.assert_almost_equal(np.float64(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) def testArrays(self): @@ -1067,9 +1065,9 @@ def testArrays(self): arr = np.arange(100.202, 200.202, 1, dtype=np.float32) arr = arr.reshape((5, 5, 4)) outp = np.array(ujson.decode(ujson.encode(arr)), dtype=np.float32) - assert_array_almost_equal_nulp(arr, outp) + tm.assert_almost_equal(arr, outp) outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) - assert_array_almost_equal_nulp(arr, outp) + tm.assert_almost_equal(arr, outp) def testOdArray(self): def will_raise(): @@ -1203,19 +1201,19 @@ def testDataFrame(self): # column indexed outp = DataFrame(ujson.decode(ujson.encode(df))) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + tm.assert_index_equal(df.index, outp.index) dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"))) outp = DataFrame(**dec) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + tm.assert_index_equal(df.index, outp.index) outp = DataFrame(ujson.decode(ujson.encode(df, orient="records"))) outp.index = df.index self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) + tm.assert_index_equal(df.columns, outp.columns) outp = DataFrame(ujson.decode(ujson.encode(df, orient="values"))) outp.index = df.index @@ -1223,8 +1221,8 @@ def testDataFrame(self): outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"))) self.assertTrue((df.transpose() == outp).values.all()) - tm.assert_numpy_array_equal(df.transpose().columns, outp.columns) - tm.assert_numpy_array_equal(df.transpose().index, outp.index) + tm.assert_index_equal(df.transpose().columns, outp.columns) + tm.assert_index_equal(df.transpose().index, outp.index) def testDataFrameNumpy(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ @@ -1233,21 +1231,21 @@ def testDataFrameNumpy(self): # column indexed outp = DataFrame(ujson.decode(ujson.encode(df), numpy=True)) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + 
tm.assert_index_equal(df.index, outp.index) dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"), numpy=True)) outp = DataFrame(**dec) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + tm.assert_index_equal(df.index, outp.index) - outp = DataFrame(ujson.decode( - ujson.encode(df, orient="index"), numpy=True)) + outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"), + numpy=True)) self.assertTrue((df.transpose() == outp).values.all()) - tm.assert_numpy_array_equal(df.transpose().columns, outp.columns) - tm.assert_numpy_array_equal(df.transpose().index, outp.index) + tm.assert_index_equal(df.transpose().columns, outp.columns) + tm.assert_index_equal(df.transpose().index, outp.index) def testDataFrameNested(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ @@ -1287,20 +1285,20 @@ def testDataFrameNumpyLabelled(self): outp = DataFrame(*ujson.decode(ujson.encode(df), numpy=True, labelled=True)) self.assertTrue((df.T == outp).values.all()) - tm.assert_numpy_array_equal(df.T.columns, outp.columns) - tm.assert_numpy_array_equal(df.T.index, outp.index) + tm.assert_index_equal(df.T.columns, outp.columns) + tm.assert_index_equal(df.T.index, outp.index) outp = DataFrame(*ujson.decode(ujson.encode(df, orient="records"), numpy=True, labelled=True)) outp.index = df.index self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) + tm.assert_index_equal(df.columns, outp.columns) outp = DataFrame(*ujson.decode(ujson.encode(df, orient="index"), numpy=True, labelled=True)) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + tm.assert_index_equal(df.index, outp.index) def testSeries(self): s = Series([10, 20, 30, 40, 50, 60], name="series", @@ -1380,42 +1378,46 @@ def testIndex(self): i = Index([23, 45, 18, 98, 43, 11], name="index") # column indexed - outp = Index(ujson.decode(ujson.encode(i))) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i)), name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode(i), numpy=True)) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i), numpy=True), name='index') + tm.assert_index_equal(i, outp) dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"))) outp = Index(**dec) - self.assertTrue(i.equals(outp)) + tm.assert_index_equal(i, outp) self.assertTrue(i.name == outp.name) dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), numpy=True)) outp = Index(**dec) - self.assertTrue(i.equals(outp)) + tm.assert_index_equal(i, outp) self.assertTrue(i.name == outp.name) - outp = Index(ujson.decode(ujson.encode(i, orient="values"))) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="values")), + name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode( - i, orient="values"), numpy=True)) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="values"), + numpy=True), name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode(i, orient="records"))) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="records")), + name='index') + tm.assert_index_equal(i, outp) - outp = 
Index(ujson.decode(ujson.encode( - i, orient="records"), numpy=True)) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="records"), + numpy=True), name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode(i, orient="index"))) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="index")), + name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode(i, orient="index"), numpy=True)) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="index"), + numpy=True), name='index') + tm.assert_index_equal(i, outp) def test_datetimeindex(self): from pandas.tseries.index import date_range @@ -1425,7 +1427,7 @@ def test_datetimeindex(self): encoded = ujson.encode(rng, date_unit='ns') decoded = DatetimeIndex(np.array(ujson.decode(encoded))) - self.assertTrue(rng.equals(decoded)) + tm.assert_index_equal(rng, decoded) ts = Series(np.random.randn(len(rng)), index=rng) decoded = Series(ujson.decode(ujson.encode(ts, date_unit='ns'))) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 24c670abe8158..b7ef754004e18 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -61,12 +61,6 @@ def test_delim_whitespace_custom_terminator(self): columns=['a', 'b', 'c']) tm.assert_frame_equal(df, expected) - def test_parse_dates_empty_string(self): - # see gh-2263 - s = StringIO("Date, test\n2012-01-01, 1\n,2") - result = self.read_csv(s, parse_dates=["Date"], na_filter=False) - self.assertTrue(result['Date'].isnull()[1]) - def test_dtype_and_names_error(self): # see gh-8833: passing both dtype and names # resulting in an error reporting issue @@ -178,28 +172,8 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_compact_ints(self): - if compat.is_platform_windows() and not self.low_memory: - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - def test_compact_ints_as_recarray(self): - if compat.is_platform_windows() and self.low_memory: + if compat.is_platform_windows(): raise nose.SkipTest( "segfaults on win-64, only when all tests are run") @@ -207,16 +181,20 @@ def test_compact_ints_as_recarray(self): '1,1,0,0\n' '0,1,0,1') - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = 
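The pattern these Index round-trip tests pin down, sketched outside the test class. Assumptions: the vendored encoder imports as pandas.json (that is how this test module gets ujson), and Python 3, so the decoded dict keys can be splatted directly; under Python 2 the tests run them through _clean_dict first.

    import pandas.json as ujson
    import pandas.util.testing as tm
    from pandas import Index

    i = Index([23, 45, 18, 98, 43, 11], name='index')

    # orient='split' emits {'name': ..., 'data': [...]}, so the name
    # survives the trip and the dict splats straight back into Index()
    dec = ujson.decode(ujson.encode(i, orient='split'))
    tm.assert_index_equal(i, Index(**dec))

    # value-only orients drop the name, hence the explicit name='index'
    # the updated assertions above now attach by hand
    outp = Index(ujson.decode(ujson.encode(i, orient='values')),
                 name='index')
    tm.assert_index_equal(i, outp)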
np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) def test_pass_dtype(self): data = """\ @@ -293,23 +271,18 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(self): {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) tm.assert_frame_equal(result, expected, check_index_type=False) - def test_empty_with_dup_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv( - StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'}) - expected = pd.concat([Series([], name='one', dtype='u1')] * 2, axis=1) - tm.assert_frame_equal(result, expected, check_index_type=False) - def test_empty_with_dup_column_pass_dtype_by_indexes(self): - # FIXME in gh-9424 - raise nose.SkipTest( - "gh-9424; known failure read_csv with duplicate columns") + # see gh-9424 + expected = pd.concat([Series([], name='one', dtype='u1'), + Series([], name='one.1', dtype='f')], axis=1) data = 'one,one' - result = self.read_csv( - StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'}) - expected = pd.concat([Series([], name='one', dtype='u1'), - Series([], name='one', dtype='f')], axis=1) + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) tm.assert_frame_equal(result, expected, check_index_type=False) def test_usecols_dtypes(self): @@ -353,17 +326,6 @@ def test_disable_bool_parsing(self): result = self.read_csv(StringIO(data), dtype=object, na_filter=False) self.assertEqual(result['B'][2], '') - def test_euro_decimal_format(self): - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - - df2 = self.read_csv(StringIO(data), sep=';', decimal=',') - self.assertEqual(df2['Number1'].dtype, float) - self.assertEqual(df2['Number2'].dtype, float) - self.assertEqual(df2['Number3'].dtype, float) - def test_custom_lineterminator(self): data = 'a,b,c~1,2,3~4,5,6' @@ -382,15 +344,6 @@ def test_raise_on_passed_int_dtype_with_nas(self): sep=",", skipinitialspace=True, dtype={'DOY': np.int64}) - def test_na_trailing_columns(self): - data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax -2012-03-14,USD,AAPL,BUY,1000 -2012-05-12,USD,SBUX,SELL,500""" - - result = self.read_csv(StringIO(data)) - self.assertEqual(result['Date'][1], '2012-05-12') - self.assertTrue(result['UnitPrice'].isnull().all()) - def test_parse_ragged_csv(self): data = """1,2,3 1,2,3,4 @@ -435,49 +388,6 @@ def test_tokenize_CR_with_quoting(self): expected = self.read_csv(StringIO(data.replace('\r', '\n'))) tm.assert_frame_equal(result, expected) - def test_raise_on_no_columns(self): - # single newline - data = "\n" - self.assertRaises(ValueError, self.read_csv, StringIO(data)) - - # test with more than a single newline - data = "\n\n\n" - self.assertRaises(ValueError, self.read_csv, StringIO(data)) - - def test_1000_sep_with_decimal(self): - data = """A|B|C -1|2,334.01|5 -10|13|10. -""" - expected = DataFrame({ - 'A': [1, 10], - 'B': [2334.01, 13], - 'C': [5, 10.] 
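A minimal sketch of the gh-9424 behaviour the rewritten test asserts: positional dtype keys now apply to duplicate column names after mangling. io.StringIO stands in for pandas.compat.StringIO used by the suite.

    import pandas as pd
    from io import StringIO

    # duplicate headers are mangled to 'one'/'one.1', and positional
    # dtype keys map onto them: uint8 for column 0, float32 for column 1
    result = pd.read_csv(StringIO('one,one'), dtype={0: 'u1', 1: 'f'})
    print(result.dtypes)  # one -> uint8, one.1 -> float32 (zero rows)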
- }) - - tm.assert_equal(expected.A.dtype, 'int64') - tm.assert_equal(expected.B.dtype, 'float') - tm.assert_equal(expected.C.dtype, 'float') - - df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data), sep='|', - thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - data_with_odd_sep = """A|B|C -1|2.334,01|5 -10|13|10, -""" - df = self.read_csv(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - def test_grow_boundary_at_cap(self): # See gh-12494 # @@ -497,25 +407,3 @@ def test_empty_header_read(count): for count in range(1, 101): test_empty_header_read(count) - - def test_inf_parsing(self): - data = """\ -,A -a,inf -b,-inf -c,Inf -d,-Inf -e,INF -f,-INF -g,INf -h,-INf -i,inF -j,-inF""" - inf = float('inf') - expected = Series([inf, -inf] * 5) - - df = self.read_csv(StringIO(data), index_col=0) - tm.assert_almost_equal(df['A'].values, expected.values) - - df = self.read_csv(StringIO(data), index_col=0, na_filter=False) - tm.assert_almost_equal(df['A'].values, expected.values) diff --git a/pandas/io/tests/parser/comment.py b/pandas/io/tests/parser/comment.py index 07fc6a167a6c0..f7cd1e190ec16 100644 --- a/pandas/io/tests/parser/comment.py +++ b/pandas/io/tests/parser/comment.py @@ -19,14 +19,14 @@ def test_comment(self): 1,2.,4.#hello world 5.,NaN,10.0 """ - expected = [[1., 2., 4.], - [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], + [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#') - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) df = self.read_table(StringIO(data), sep=',', comment='#', na_values=['NaN']) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_line_comment(self): data = """# empty @@ -35,10 +35,10 @@ def test_line_comment(self): #ignore this line 5.,NaN,10.0 """ - expected = [[1., 2., 4.], - [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], + [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#') - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) # check with delim_whitespace=True df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', @@ -48,11 +48,11 @@ def test_line_comment(self): # custom line terminator is not supported # with the Python parser yet if self.engine == 'c': - expected = [[1., 2., 4.], - [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], + [5., np.nan, 10.]]) df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#', lineterminator='*') - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_comment_skiprows(self): data = """# empty @@ -64,9 +64,9 @@ def test_comment_skiprows(self): 5.,NaN,10.0 """ # this should ignore the first four lines (including comments) - expected = [[1., 2., 4.], [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#', skiprows=4) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_comment_header(self): data = """# empty @@ -77,9 +77,9 @@ def test_comment_header(self): 5.,NaN,10.0 """ # header should begin at the second non-comment line - expected = [[1., 2., 
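The test_1000_sep_with_decimal removal above is a relocation (it is re-added under common.py further down, so both engines cover it). The behaviour itself in brief: thousands and decimal markers are swappable as long as they stay distinct.

    import pandas as pd
    from io import StringIO

    data = 'A|B|C\n1|2,334.01|5\n10|13|10.\n'
    df = pd.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
    # B parses as [2334.01, 13.0]; European style just swaps the markers:
    data = 'A|B|C\n1|2.334,01|5\n10|13|10,\n'
    df2 = pd.read_csv(StringIO(data), sep='|', thousands='.', decimal=',')
    # df and df2 come out equal frame-for-frame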
4.], [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#', header=1) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_comment_skiprows_header(self): data = """# empty @@ -94,9 +94,9 @@ def test_comment_skiprows_header(self): # skiprows should skip the first 4 lines (including comments), while # header should start from the second non-commented line starting # with line 5 - expected = [[1., 2., 4.], [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_custom_comment_char(self): data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 4d9ce922184d9..f8c7241fdf88a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -10,7 +10,6 @@ import nose import numpy as np -from numpy.testing.decorators import slow from pandas.lib import Timestamp import pandas as pd @@ -41,10 +40,10 @@ def test_empty_decimal_marker(self): 1|2,334|5 10|13|10. """ - # C parser: supports only length-1 decimals - # Python parser: 'decimal' not supported yet - self.assertRaises(ValueError, self.read_csv, - StringIO(data), decimal='') + # Parsers support only length-1 decimals + msg = 'Only length-1 decimal markers supported' + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(data), decimal='') def test_read_csv(self): if not compat.PY3: @@ -233,16 +232,18 @@ def test_unnamed_columns(self): 6,7,8,9,10 11,12,13,14,15 """ - expected = [[1, 2, 3, 4, 5.], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]] + expected = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], dtype=np.int64) df = self.read_table(StringIO(data), sep=',') tm.assert_almost_equal(df.values, expected) - self.assert_numpy_array_equal(df.columns, - ['A', 'B', 'C', 'Unnamed: 3', - 'Unnamed: 4']) + self.assert_index_equal(df.columns, + Index(['A', 'B', 'C', 'Unnamed: 3', + 'Unnamed: 4'])) def test_duplicate_columns(self): + # TODO: add test for condition 'mangle_dupe_cols=False' + # once it is actually supported (gh-12935) data = """A,A,B,B,B 1,2,3,4,5 6,7,8,9,10 @@ -256,11 +257,6 @@ def test_duplicate_columns(self): self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2']) - df = getattr(self, method)(StringIO(data), sep=',', - mangle_dupe_cols=False) - self.assertEqual(list(df.columns), - ['A', 'A', 'B', 'B', 'B']) - df = getattr(self, method)(StringIO(data), sep=',', mangle_dupe_cols=True) self.assertEqual(list(df.columns), @@ -279,7 +275,7 @@ def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) df2 = self.read_table(self.csv1, sep=',', index_col=0, parse_dates=True) - self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D']) + self.assert_index_equal(df.columns, pd.Index(['A', 'B', 'C', 'D'])) self.assertEqual(df.index.name, 'index') self.assertIsInstance( df.index[0], (datetime, np.datetime64, Timestamp)) @@ -290,12 +286,12 @@ def test_read_csv_no_index_name(self): df = self.read_csv(self.csv2, index_col=0, parse_dates=True) df2 = self.read_table(self.csv2, sep=',', index_col=0, parse_dates=True) - self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']) - self.assertIsInstance( - df.index[0], (datetime, 
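All of these comment-handling tests exercise one contract: text from the comment character to end-of-line is discarded, and fully commented lines disappear before header/skiprows bookkeeping. A sketch:

    import pandas as pd
    from io import StringIO

    data = '# empty\nA,B,C\n1,2.,4.#hello world\n5.,NaN,10.0\n'
    df = pd.read_csv(StringIO(data), comment='#')
    # the leading '# empty' line vanishes and '#hello world' is
    # stripped from the first data row, leaving a 2x3 frame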
np.datetime64, Timestamp)) - self.assertEqual(df.ix[ - :, ['A', 'B', 'C', 'D'] - ].values.dtype, np.float64) + self.assert_index_equal(df.columns, + pd.Index(['A', 'B', 'C', 'D', 'E'])) + self.assertIsInstance(df.index[0], + (datetime, np.datetime64, Timestamp)) + self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, + np.float64) tm.assert_frame_equal(df, df2) def test_read_table_unicode(self): @@ -394,10 +390,23 @@ def test_int_conversion(self): self.assertEqual(data['B'].dtype, np.int64) def test_read_nrows(self): - df = self.read_csv(StringIO(self.data1), nrows=3) expected = self.read_csv(StringIO(self.data1))[:3] + + df = self.read_csv(StringIO(self.data1), nrows=3) tm.assert_frame_equal(df, expected) + # see gh-10476 + df = self.read_csv(StringIO(self.data1), nrows=3.0) + tm.assert_frame_equal(df, expected) + + msg = "must be an integer" + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(self.data1), nrows=1.2) + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(self.data1), nrows='foo') + def test_read_chunksize(self): reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2) df = self.read_csv(StringIO(self.data1), index_col=0) @@ -597,7 +606,7 @@ def test_url(self): tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing - @slow + @tm.slow def test_file(self): # FILE @@ -818,11 +827,6 @@ def test_ignore_leading_whitespace(self): expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]}) tm.assert_frame_equal(result, expected) - def test_nrows_and_chunksize_raises_notimplemented(self): - data = 'a b c' - self.assertRaises(NotImplementedError, self.read_csv, StringIO(data), - nrows=10, chunksize=5) - def test_chunk_begins_with_newline_whitespace(self): # see gh-10022 data = '\n hello\nworld\n' @@ -1117,21 +1121,21 @@ def test_empty_lines(self): -70,.4,1 """ - expected = [[1., 2., 4.], - [5., np.nan, 10.], - [-70., .4, 1.]] + expected = np.array([[1., 2., 4.], + [5., np.nan, 10.], + [-70., .4, 1.]]) df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') - tm.assert_almost_equal(df.values, expected) - expected = [[1., 2., 4.], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [5., np.nan, 10.], - [np.nan, np.nan, np.nan], - [-70., .4, 1.]] + tm.assert_numpy_array_equal(df.values, expected) + expected = np.array([[1., 2., 4.], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5., np.nan, 10.], + [np.nan, np.nan, np.nan], + [-70., .4, 1.]]) df = self.read_csv(StringIO(data), skip_blank_lines=False) - tm.assert_almost_equal(list(df.values), list(expected)) + tm.assert_numpy_array_equal(df.values, expected) def test_whitespace_lines(self): data = """ @@ -1142,10 +1146,10 @@ def test_whitespace_lines(self): \t 1,2.,4. 5.,NaN,10.0 """ - expected = [[1, 2., 4.], - [5., np.nan, 10.]] + expected = np.array([[1, 2., 4.], + [5., np.nan, 10.]]) df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_regex_separator(self): # see gh-6607 @@ -1236,3 +1240,136 @@ def test_iteration_open_handle(self): result = self.read_table(f, squeeze=True, header=None) expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0) tm.assert_series_equal(result, expected) + + def test_1000_sep_with_decimal(self): + data = """A|B|C +1|2,334.01|5 +10|13|10. 
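gh-10476, asserted above: nrows tolerates a whole-number float but rejects anything that is not integer-like. Sketch:

    import pandas as pd
    from io import StringIO

    data = 'a,b\n1,2\n3,4\n5,6\n7,8\n'
    pd.read_csv(StringIO(data), nrows=3.0)       # fine: treated as 3
    try:
        pd.read_csv(StringIO(data), nrows=1.2)   # raises
    except ValueError as err:
        assert 'must be an integer' in str(err)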
+""" + expected = DataFrame({ + 'A': [1, 10], + 'B': [2334.01, 13], + 'C': [5, 10.] + }) + + tm.assert_equal(expected.A.dtype, 'int64') + tm.assert_equal(expected.B.dtype, 'float') + tm.assert_equal(expected.C.dtype, 'float') + + df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data), sep='|', + thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + data_with_odd_sep = """A|B|C +1|2.334,01|5 +10|13|10, +""" + df = self.read_csv(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + def test_euro_decimal_format(self): + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + df2 = self.read_csv(StringIO(data), sep=';', decimal=',') + self.assertEqual(df2['Number1'].dtype, float) + self.assertEqual(df2['Number2'].dtype, float) + self.assertEqual(df2['Number3'].dtype, float) + + def test_read_duplicate_names(self): + # See gh-7160 + data = "a,b,a\n0,1,2\n3,4,5" + df = self.read_csv(StringIO(data)) + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=['a', 'b', 'a.1']) + tm.assert_frame_equal(df, expected) + + data = "0,1,2\n3,4,5" + df = self.read_csv(StringIO(data), names=["a", "b", "a"]) + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=['a', 'b', 'a.1']) + tm.assert_frame_equal(df, expected) + + def test_inf_parsing(self): + data = """\ +,A +a,inf +b,-inf +c,+Inf +d,-Inf +e,INF +f,-INF +g,+INf +h,-INf +i,inF +j,-inF""" + inf = float('inf') + expected = Series([inf, -inf] * 5) + + df = self.read_csv(StringIO(data), index_col=0) + tm.assert_almost_equal(df['A'].values, expected.values) + + df = self.read_csv(StringIO(data), index_col=0, na_filter=False) + tm.assert_almost_equal(df['A'].values, expected.values) + + def test_raise_on_no_columns(self): + # single newline + data = "\n" + self.assertRaises(EmptyDataError, self.read_csv, StringIO(data)) + + # test with more than a single newline + data = "\n\n\n" + self.assertRaises(EmptyDataError, self.read_csv, StringIO(data)) + + def test_compact_ints_use_unsigned(self): + # see gh-13323 + data = 'a,b,c\n1,9,258' + + # sanity check + expected = DataFrame({ + 'a': np.array([1], dtype=np.int64), + 'b': np.array([9], dtype=np.int64), + 'c': np.array([258], dtype=np.int64), + }) + out = self.read_csv(StringIO(data)) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.int8), + 'b': np.array([9], dtype=np.int8), + 'c': np.array([258], dtype=np.int16), + }) + + # default behaviour for 'use_unsigned' + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True) + tm.assert_frame_equal(out, expected) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=False) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.uint8), + 'b': np.array([9], dtype=np.uint8), + 'c': np.array([258], dtype=np.uint16), + }) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=True) + 
tm.assert_frame_equal(out, expected) diff --git a/pandas/io/tests/parser/header.py b/pandas/io/tests/parser/header.py index e3c408f0af907..ca148b373d659 100644 --- a/pandas/io/tests/parser/header.py +++ b/pandas/io/tests/parser/header.py @@ -43,14 +43,14 @@ def test_no_header_prefix(self): df_pref = self.read_table(StringIO(data), sep=',', prefix='Field', header=None) - expected = [[1, 2, 3, 4, 5.], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]] + expected = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], dtype=np.int64) tm.assert_almost_equal(df_pref.values, expected) - self.assert_numpy_array_equal( - df_pref.columns, ['Field0', 'Field1', 'Field2', - 'Field3', 'Field4']) + self.assert_index_equal(df_pref.columns, + Index(['Field0', 'Field1', 'Field2', + 'Field3', 'Field4'])) def test_header_with_index_col(self): data = """foo,1,2,3 @@ -262,14 +262,14 @@ def test_no_header(self): names = ['foo', 'bar', 'baz', 'quux', 'panda'] df2 = self.read_table(StringIO(data), sep=',', names=names) - expected = [[1, 2, 3, 4, 5.], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]] + expected = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], dtype=np.int64) tm.assert_almost_equal(df.values, expected) tm.assert_almost_equal(df.values, df2.values) - self.assert_numpy_array_equal(df_pref.columns, - ['X0', 'X1', 'X2', 'X3', 'X4']) - self.assert_numpy_array_equal(df.columns, lrange(5)) + self.assert_index_equal(df_pref.columns, + Index(['X0', 'X1', 'X2', 'X3', 'X4'])) + self.assert_index_equal(df.columns, Index(lrange(5))) - self.assert_numpy_array_equal(df2.columns, names) + self.assert_index_equal(df2.columns, Index(names)) diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index 853e6242751c9..2a8c934abce61 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -11,7 +11,7 @@ import pandas.io.parsers as parsers import pandas.util.testing as tm -from pandas import DataFrame, MultiIndex, read_csv +from pandas import DataFrame, MultiIndex from pandas.compat import StringIO, range @@ -37,62 +37,36 @@ def test_detect_string_na(self): NA,baz NaN,nan """ - expected = [['foo', 'bar'], [nan, 'baz'], [nan, nan]] + expected = np.array([['foo', 'bar'], [nan, 'baz'], [nan, nan]], + dtype=np.object_) df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_non_string_na_values(self): - # see gh-3611, na_values that are not a string are an issue - with tm.ensure_clean('__non_string_na_values__.csv') as path: - df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]}) - df.to_csv(path, sep=' ', index=False) - result1 = self.read_csv(path, sep=' ', header=0, - na_values=['-999.0', '-999']) - result2 = self.read_csv(path, sep=' ', header=0, - na_values=[-999, -999.0]) - result3 = self.read_csv(path, sep=' ', header=0, - na_values=[-999.0, -999]) - tm.assert_frame_equal(result1, result2) - tm.assert_frame_equal(result2, result3) - - result4 = self.read_csv( - path, sep=' ', header=0, na_values=['-999.0']) - result5 = self.read_csv( - path, sep=' ', header=0, na_values=['-999']) - result6 = self.read_csv( - path, sep=' ', header=0, na_values=[-999.0]) - result7 = self.read_csv( - path, sep=' ', header=0, na_values=[-999]) - tm.assert_frame_equal(result4, result3) - tm.assert_frame_equal(result5, result3) - tm.assert_frame_equal(result6, result3) - tm.assert_frame_equal(result7, result3) - - good_compare = result3 - - # with an odd 
float format, so we can't match the string 999.0 - # exactly, but need float matching - # TODO: change these to self.read_csv when Python bug is squashed - df.to_csv(path, sep=' ', index=False, float_format='%.3f') - result1 = read_csv(path, sep=' ', header=0, - na_values=['-999.0', '-999']) - result2 = read_csv(path, sep=' ', header=0, - na_values=[-999.0, -999]) - tm.assert_frame_equal(result1, good_compare) - tm.assert_frame_equal(result2, good_compare) - - result3 = read_csv(path, sep=' ', - header=0, na_values=['-999.0']) - result4 = read_csv(path, sep=' ', - header=0, na_values=['-999']) - result5 = read_csv(path, sep=' ', - header=0, na_values=[-999.0]) - result6 = read_csv(path, sep=' ', - header=0, na_values=[-999]) - tm.assert_frame_equal(result3, good_compare) - tm.assert_frame_equal(result4, good_compare) - tm.assert_frame_equal(result5, good_compare) - tm.assert_frame_equal(result6, good_compare) + # see gh-3611: with an odd float format, we can't match + # the string '999.0' exactly but still need float matching + nice = """A,B +-999,1.2 +2,-999 +3,4.5 +""" + ugly = """A,B +-999,1.200 +2,-999.000 +3,4.500 +""" + na_values_param = [['-999.0', '-999'], + [-999, -999.0], + [-999.0, -999], + ['-999.0'], ['-999'], + [-999.0], [-999]] + expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], + [3.0, 4.5]], columns=['A', 'B']) + + for data in (nice, ugly): + for na_values in na_values_param: + out = self.read_csv(StringIO(data), na_values=na_values) + tm.assert_frame_equal(out, expected) def test_default_na_values(self): _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', @@ -126,20 +100,20 @@ def test_custom_na_values(self): -1.#IND,5,baz 7,8,NaN """ - expected = [[1., nan, 3], - [nan, 5, nan], - [7, 8, nan]] + expected = np.array([[1., nan, 3], + [nan, 5, nan], + [7, 8, nan]]) df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'], skiprows=[1]) - tm.assert_almost_equal(df2.values, expected) + tm.assert_numpy_array_equal(df2.values, expected) df3 = self.read_table(StringIO(data), sep=',', na_values='baz', skiprows=[1]) - tm.assert_almost_equal(df3.values, expected) + tm.assert_numpy_array_equal(df3.values, expected) def test_bool_na_values(self): data = """A,B,C @@ -250,116 +224,29 @@ def test_na_values_keep_default(self): 'seven']}) tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - def test_skiprow_with_newline(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line 11 -line 12",2 -2,"line 21 -line 22",2 -3,"line 31",1""" - expected = [[2, 'line 21\nline 22', 2], - [3, 'line 31', 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = ('a,b,c\n~a\n b~,~e\n d~,' - '~f\n f~\n1,2,~12\n 13\n 14~') - expected = [['a\n b', 'e\n d', 'f\n f']] - expected = DataFrame(expected, columns=[ - 'a', 'b', 'c']) - df = self.read_csv(StringIO(data), - quotechar="~", - skiprows=[2]) - tm.assert_frame_equal(df, expected) - - data = ('Text,url\n~example\n ' - 'sentence\n one~,url1\n~' - 'example\n sentence\n two~,url2\n~' - 'example\n sentence\n three~,url3') - expected = [['example\n sentence\n two', 'url2']] - expected = DataFrame(expected, columns=[ - 'Text', 'url']) - df = self.read_csv(StringIO(data), - quotechar="~", - skiprows=[1, 3]) - tm.assert_frame_equal(df, 
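gh-3611 in one line: numeric na_values match however the float happens to be printed, so -999 catches '-999.000' too. Sketch:

    import pandas as pd
    from io import StringIO

    ugly = 'A,B\n-999,1.200\n2,-999.000\n3,4.500\n'
    out = pd.read_csv(StringIO(ugly), na_values=[-999])
    assert out['A'].isnull()[0] and out['B'].isnull()[1]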
expected) - - def test_skiprow_with_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line '11' line 12",2 -2,"line '21' line 22",2 -3,"line '31' line 32",1""" - expected = [[2, "line '21' line 22", 2], - [3, "line '31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_newline_and_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line \n'11' line 12",2 -2,"line \n'21' line 22",2 -3,"line \n'31' line 32",1""" - expected = [[2, "line \n'21' line 22", 2], - [3, "line \n'31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = """id,text,num_lines -1,"line '11\n' line 12",2 -2,"line '21\n' line 22",2 -3,"line '31\n' line 32",1""" - expected = [[2, "line '21\n' line 22", 2], - [3, "line '31\n' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) + def test_na_values_na_filter_override(self): + data = """\ +A,B +1,A +nan,B +3,C +""" - data = """id,text,num_lines -1,"line '11\n' \r\tline 12",2 -2,"line '21\n' \r\tline 22",2 -3,"line '31\n' \r\tline 32",1""" - expected = [[2, "line '21\n' \r\tline 22", 2], - [3, "line '31\n' \r\tline 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) + expected = DataFrame([[1, 'A'], [np.nan, np.nan], [3, 'C']], + columns=['A', 'B']) + out = self.read_csv(StringIO(data), na_values=['B'], na_filter=True) + tm.assert_frame_equal(out, expected) - def test_skiprows_lineterminator(self): - # see gh-9079 - data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', - '2007/01/01 01:00 0.2140 U M ', - '2007/01/01 02:00 0.2141 M O ', - '2007/01/01 04:00 0.2142 D M ']) - expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], - ['2007/01/01', '02:00', 0.2141, 'M', 'O'], - ['2007/01/01', '04:00', 0.2142, 'D', 'M']], - columns=['date', 'time', 'var', 'flag', - 'oflag']) - - # test with default line terminators "LF" and "CRLF" - df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) + expected = DataFrame([['1', 'A'], ['nan', 'B'], ['3', 'C']], + columns=['A', 'B']) + out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False) + tm.assert_frame_equal(out, expected) - df = self.read_csv(StringIO(data.replace('\n', '\r\n')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) + def test_na_trailing_columns(self): + data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax +2012-03-14,USD,AAPL,BUY,1000 +2012-05-12,USD,SBUX,SELL,500""" - # "CR" is not respected with the Python parser yet - if self.engine == 'c': - df = self.read_csv(StringIO(data.replace('\n', '\r')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) + result = self.read_csv(StringIO(data)) + self.assertEqual(result['Date'][1], '2012-05-12') + self.assertTrue(result['UnitPrice'].isnull().all()) diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py 
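And the override the new test pins down: na_filter=False disables NA detection entirely, beating any na_values that were passed. Sketch:

    import pandas as pd
    from io import StringIO

    data = 'A,B\n1,A\nnan,B\n3,C\n'
    out = pd.read_csv(StringIO(data), na_values=['B'], na_filter=False)
    # nothing becomes NaN: out['A'] is ['1', 'nan', '3'] as strings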
index ec368bb358ad5..01816bde66120 100644 --- a/pandas/io/tests/parser/parse_dates.py +++ b/pandas/io/tests/parser/parse_dates.py @@ -467,3 +467,10 @@ def test_read_with_parse_dates_invalid_type(self): StringIO(data), parse_dates=np.array([4, 5])) tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, StringIO(data), parse_dates=set([1, 3, 3])) + + def test_parse_dates_empty_string(self): + # see gh-2263 + data = "Date, test\n2012-01-01, 1\n,2" + result = self.read_csv(StringIO(data), parse_dates=["Date"], + na_filter=False) + self.assertTrue(result['Date'].isnull()[1]) diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index 7d1793c429f4e..a08cb36c13f80 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -40,7 +40,8 @@ def test_sniff_delimiter(self): baz|7|8|9 """ data = self.read_csv(StringIO(text), index_col=0, sep=None) - self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz']))) + self.assert_index_equal(data.index, + Index(['foo', 'bar', 'baz'], name='index')) data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|') tm.assert_frame_equal(data, data2) diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py index 3e585a9a623c9..c9f50dec6c01e 100644 --- a/pandas/io/tests/parser/skiprows.py +++ b/pandas/io/tests/parser/skiprows.py @@ -76,3 +76,117 @@ def test_skiprows_blank(self): datetime(2000, 1, 3)]) expected.index.name = 0 tm.assert_frame_equal(data, expected) + + def test_skiprow_with_newline(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line 11 +line 12",2 +2,"line 21 +line 22",2 +3,"line 31",1""" + expected = [[2, 'line 21\nline 22', 2], + [3, 'line 31', 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = ('a,b,c\n~a\n b~,~e\n d~,' + '~f\n f~\n1,2,~12\n 13\n 14~') + expected = [['a\n b', 'e\n d', 'f\n f']] + expected = DataFrame(expected, columns=[ + 'a', 'b', 'c']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[2]) + tm.assert_frame_equal(df, expected) + + data = ('Text,url\n~example\n ' + 'sentence\n one~,url1\n~' + 'example\n sentence\n two~,url2\n~' + 'example\n sentence\n three~,url3') + expected = [['example\n sentence\n two', 'url2']] + expected = DataFrame(expected, columns=[ + 'Text', 'url']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[1, 3]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_quote(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line '11' line 12",2 +2,"line '21' line 22",2 +3,"line '31' line 32",1""" + expected = [[2, "line '21' line 22", 2], + [3, "line '31' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_newline_and_quote(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line \n'11' line 12",2 +2,"line \n'21' line 22",2 +3,"line \n'31' line 32",1""" + expected = [[2, "line \n'21' line 22", 2], + [3, "line \n'31' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' line 12",2 +2,"line '21\n' line 22",2 +3,"line '31\n' line 32",1""" + 
expected = [[2, "line '21\n' line 22", 2], + [3, "line '31\n' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' \r\tline 12",2 +2,"line '21\n' \r\tline 22",2 +3,"line '31\n' \r\tline 32",1""" + expected = [[2, "line '21\n' \r\tline 22", 2], + [3, "line '31\n' \r\tline 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + def test_skiprows_lineterminator(self): + # see gh-9079 + data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', + '2007/01/01 01:00 0.2140 U M ', + '2007/01/01 02:00 0.2141 M O ', + '2007/01/01 04:00 0.2142 D M ']) + expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], + ['2007/01/01', '02:00', 0.2141, 'M', 'O'], + ['2007/01/01', '04:00', 0.2142, 'D', 'M']], + columns=['date', 'time', 'var', 'flag', + 'oflag']) + + # test with default line terminators "LF" and "CRLF" + df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(data.replace('\n', '\r\n')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + + # "CR" is not respected with the Python parser yet + if self.engine == 'c': + df = self.read_csv(StringIO(data.replace('\n', '\r')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 374485b5ddaad..fda7b28769647 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -72,25 +72,16 @@ def read_csv(self, *args, **kwds): kwds = kwds.copy() kwds['engine'] = self.engine kwds['low_memory'] = self.low_memory - kwds['buffer_lines'] = 2 return read_csv(*args, **kwds) def read_table(self, *args, **kwds): kwds = kwds.copy() kwds['engine'] = self.engine kwds['low_memory'] = True - kwds['buffer_lines'] = 2 return read_table(*args, **kwds) class TestPythonParser(BaseParser, PythonParserTests, tm.TestCase): - """ - Class for Python parser testing. Unless specifically stated - as a PythonParser-specific issue, the goal is to eventually move - as many of these tests into ParserTests as soon as the C parser - can accept further specific arguments when parsing. - """ - engine = 'python' float_precision_choices = [None] diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/io/tests/parser/test_read_fwf.py index 5599188400368..11b10211650d6 100644 --- a/pandas/io/tests/parser/test_read_fwf.py +++ b/pandas/io/tests/parser/test_read_fwf.py @@ -217,8 +217,8 @@ def test_comment_fwf(self): 1 2. 4 #hello world 5 NaN 10.0 """ - expected = [[1, 2., 4], - [5, np.nan, 10.]] + expected = np.array([[1, 2., 4], + [5, np.nan, 10.]]) df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)], comment='#') tm.assert_almost_equal(df.values, expected) @@ -228,8 +228,8 @@ def test_1000_fwf(self): 1 2,334.0 5 10 13 10. 
""" - expected = [[1, 2334., 5], - [10, 13, 10]] + expected = np.array([[1, 2334., 5], + [10, 13, 10]]) df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)], thousands=',') tm.assert_almost_equal(df.values, expected) diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/io/tests/parser/test_textreader.py index f3de604f1ec48..c35cfca7012d3 100644 --- a/pandas/io/tests/parser/test_textreader.py +++ b/pandas/io/tests/parser/test_textreader.py @@ -76,8 +76,12 @@ def test_skipinitialspace(self): header=None) result = reader.read() - self.assert_numpy_array_equal(result[0], ['a', 'a', 'a', 'a']) - self.assert_numpy_array_equal(result[1], ['b', 'b', 'b', 'b']) + self.assert_numpy_array_equal(result[0], + np.array(['a', 'a', 'a', 'a'], + dtype=np.object_)) + self.assert_numpy_array_equal(result[1], + np.array(['b', 'b', 'b', 'b'], + dtype=np.object_)) def test_parse_booleans(self): data = 'True\nFalse\nTrue\nTrue' @@ -94,8 +98,10 @@ def test_delimit_whitespace(self): header=None) result = reader.read() - self.assert_numpy_array_equal(result[0], ['a', 'a', 'a']) - self.assert_numpy_array_equal(result[1], ['b', 'b', 'b']) + self.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'], + dtype=np.object_)) + self.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'], + dtype=np.object_)) def test_embedded_newline(self): data = 'a\n"hello\nthere"\nthis' @@ -103,7 +109,7 @@ def test_embedded_newline(self): reader = TextReader(StringIO(data), header=None) result = reader.read() - expected = ['a', 'hello\nthere', 'this'] + expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_) self.assert_numpy_array_equal(result[0], expected) def test_euro_decimal(self): @@ -113,7 +119,7 @@ def test_euro_decimal(self): decimal=',', header=None) result = reader.read() - expected = [12345.67, 345.678] + expected = np.array([12345.67, 345.678]) tm.assert_almost_equal(result[0], expected) def test_integer_thousands(self): @@ -123,7 +129,7 @@ def test_integer_thousands(self): thousands=',', header=None) result = reader.read() - expected = [123456, 12500] + expected = np.array([123456, 12500], dtype=np.int64) tm.assert_almost_equal(result[0], expected) def test_integer_thousands_alt(self): diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 1813a95d7a306..97862ffa90cef 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -20,6 +20,25 @@ class TestUnsupportedFeatures(tm.TestCase): + def test_mangle_dupe_cols_false(self): + # see gh-12935 + data = 'a b c\n1 2 3' + msg = 'is not supported' + + for engine in ('c', 'python'): + with tm.assertRaisesRegexp(ValueError, msg): + read_csv(StringIO(data), engine=engine, + mangle_dupe_cols=False) + + def test_nrows_and_chunksize(self): + data = 'a b c' + msg = "cannot be used together yet" + + for engine in ('c', 'python'): + with tm.assertRaisesRegexp(NotImplementedError, msg): + read_csv(StringIO(data), engine=engine, + nrows=10, chunksize=5) + def test_c_engine(self): # see gh-6607 data = 'a b c\n1 2 3' @@ -98,6 +117,32 @@ def test_python_engine(self): with tm.assertRaisesRegexp(ValueError, msg): read_csv(StringIO(data), engine=engine, **kwargs) + +class TestDeprecatedFeatures(tm.TestCase): + def test_deprecated_args(self): + data = '1,2,3' + + # deprecated arguments with non-default values + deprecated = { + 'buffer_lines': True, + 'compact_ints': True, + 'use_unsigned': True, + } + + engines = 'c', 'python' + + for engine in 
engines: + for arg, non_default_val in deprecated.items(): + if engine == 'python' and arg == 'buffer_lines': + # unsupported --> exception is raised first + continue + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + kwargs = {arg: non_default_val} + read_csv(StringIO(data), engine=engine, + **kwargs) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 06275c168becd..0d3ae95f0d1d4 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -6,6 +6,7 @@ """ from datetime import datetime +import nose import pandas.util.testing as tm @@ -22,9 +23,8 @@ def test_raise_on_mixed_dtype_usecols(self): 1000,2000,3000 4000,5000,6000 """ - msg = ("The elements of \'usecols\' " - "must either be all strings " - "or all integers") + msg = ("The elements of 'usecols' must " + "either be all strings, all unicode, or all integers") usecols = [0, 'b', 2] with tm.assertRaisesRegexp(ValueError, msg): @@ -254,3 +254,103 @@ def test_usecols_with_parse_dates_and_usecol_names(self): usecols=[3, 0, 2], parse_dates=parse_dates) tm.assert_frame_equal(df, expected) + + def test_usecols_with_unicode_strings(self): + # see gh-13219 + + s = '''AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + data = { + 'AAA': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'BBB': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB']) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_single_byte_unicode_strings(self): + # see gh-13219 + + s = '''A,B,C,D + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + data = { + 'A': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'B': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=[u'A', u'B']) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_mixed_encoding_strings(self): + s = '''AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + msg = ("The elements of 'usecols' must " + "either be all strings, all unicode, or all integers") + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB']) + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB']) + + def test_usecols_with_multibyte_characters(self): + s = '''あああ,いい,ううう,ええええ + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + data = { + 'あああ': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'いい': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=['あああ', 'いい']) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_multibyte_unicode_characters(self): + raise nose.SkipTest('TODO: see gh-13253') + + s = '''あああ,いい,ううう,ええええ + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + data = { + 'あああ': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'いい': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい']) + tm.assert_frame_equal(df, expected) diff --git a/pandas/io/tests/test_data.py 
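The loosened usecols message above reflects the rule itself: the list must be homogeneous, all positions or all (byte/unicode) strings, never a mix. Sketch:

    import pandas as pd
    from io import StringIO

    s = 'a,b,c\n1,2,3\n4,5,6\n'
    pd.read_csv(StringIO(s), usecols=[0, 2])       # all positions: fine
    pd.read_csv(StringIO(s), usecols=['a', 'c'])   # all names: fine
    try:
        pd.read_csv(StringIO(s), usecols=[0, 'b', 2])
    except ValueError:
        pass  # mixing positions and names raises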
b/pandas/io/tests/test_data.py index d9c09fa788332..1efa8b13598a7 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -302,6 +302,8 @@ class TestYahooOptions(tm.TestCase): @classmethod def setUpClass(cls): super(TestYahooOptions, cls).setUpClass() + raise nose.SkipTest('disable Yahoo Options tests') + _skip_if_no_lxml() _skip_if_no_bs() raise nose.SkipTest('unreliable test') @@ -472,9 +474,6 @@ def test_options_source_warning(self): class TestDataReader(tm.TestCase): - def test_is_s3_url(self): - from pandas.io.common import _is_s3_url - self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com")) @network def test_read_yahoo(self): @@ -503,6 +502,12 @@ def test_read_famafrench(self): class TestFred(tm.TestCase): + + @classmethod + def setUpClass(cls): + super(TestFred, cls).setUpClass() + raise nose.SkipTest('disable Fred tests') + @network def test_fred(self): raise nose.SkipTest('buggy as of 2/14/16; maybe a data revision?') diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index af053450d78c4..b7e5360a6f3db 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -13,7 +13,6 @@ from numpy import nan import numpy as np -from numpy.testing.decorators import slow import pandas as pd from pandas import DataFrame, Index, MultiIndex @@ -544,7 +543,7 @@ def test_read_from_s3_url(self): local_table = self.get_exceldf('test1') tm.assert_frame_equal(url_table, local_table) - @slow + @tm.slow def test_read_from_file_url(self): # FILE @@ -1102,9 +1101,9 @@ def test_sheets(self): tm.assert_frame_equal(self.frame, recons) recons = read_excel(reader, 'test2', index_col=0) tm.assert_frame_equal(self.tsframe, recons) - np.testing.assert_equal(2, len(reader.sheet_names)) - np.testing.assert_equal('test1', reader.sheet_names[0]) - np.testing.assert_equal('test2', reader.sheet_names[1]) + self.assertEqual(2, len(reader.sheet_names)) + self.assertEqual('test1', reader.sheet_names[0]) + self.assertEqual('test2', reader.sheet_names[1]) def test_colaliases(self): _skip_if_no_xlrd() diff --git a/pandas/io/tests/test_ga.py b/pandas/io/tests/test_ga.py index b8b698691a9f5..469e121f633d7 100644 --- a/pandas/io/tests/test_ga.py +++ b/pandas/io/tests/test_ga.py @@ -7,8 +7,8 @@ import nose import pandas as pd from pandas import compat -from pandas.util.testing import network, assert_frame_equal, with_connectivity_check -from numpy.testing.decorators import slow +from pandas.util.testing import (network, assert_frame_equal, + with_connectivity_check, slow) import pandas.util.testing as tm if compat.PY3: diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 21d0748fb6aba..5a95fe7727df0 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -16,7 +16,6 @@ import numpy as np from numpy.random import rand -from numpy.testing.decorators import slow from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) @@ -129,7 +128,7 @@ def test_spam_url(self): assert_framelist_equal(df1, df2) - @slow + @tm.slow def test_banklist(self): df1 = self.read_html(self.banklist_data, '.*Florida.*', attrs={'id': 'table'}) @@ -289,9 +288,9 @@ def test_invalid_url(self): self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') except ValueError as e: - tm.assert_equal(str(e), 'No tables found') + self.assertEqual(str(e), 'No tables found') - @slow + @tm.slow def test_file_url(self): url = self.banklist_data dfs = self.read_html(file_path_to_url(url), 'First', @@ -300,7 
+299,7 @@ def test_file_url(self): for df in dfs: tm.assertIsInstance(df, DataFrame) - @slow + @tm.slow def test_invalid_table_attrs(self): url = self.banklist_data with tm.assertRaisesRegexp(ValueError, 'No tables found'): @@ -311,39 +310,39 @@ def _bank_data(self, *args, **kwargs): return self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'}, *args, **kwargs) - @slow + @tm.slow def test_multiindex_header(self): df = self._bank_data(header=[0, 1])[0] tm.assertIsInstance(df.columns, MultiIndex) - @slow + @tm.slow def test_multiindex_index(self): df = self._bank_data(index_col=[0, 1])[0] tm.assertIsInstance(df.index, MultiIndex) - @slow + @tm.slow def test_multiindex_header_index(self): df = self._bank_data(header=[0, 1], index_col=[0, 1])[0] tm.assertIsInstance(df.columns, MultiIndex) tm.assertIsInstance(df.index, MultiIndex) - @slow + @tm.slow def test_multiindex_header_skiprows_tuples(self): df = self._bank_data(header=[0, 1], skiprows=1, tupleize_cols=True)[0] tm.assertIsInstance(df.columns, Index) - @slow + @tm.slow def test_multiindex_header_skiprows(self): df = self._bank_data(header=[0, 1], skiprows=1)[0] tm.assertIsInstance(df.columns, MultiIndex) - @slow + @tm.slow def test_multiindex_header_index_skiprows(self): df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] tm.assertIsInstance(df.index, MultiIndex) tm.assertIsInstance(df.columns, MultiIndex) - @slow + @tm.slow def test_regex_idempotency(self): url = self.banklist_data dfs = self.read_html(file_path_to_url(url), @@ -371,7 +370,7 @@ def test_python_docs_table(self): zz = [df.iloc[0, 0][0:4] for df in dfs] self.assertEqual(sorted(zz), sorted(['Repo', 'What'])) - @slow + @tm.slow def test_thousands_macau_stats(self): all_non_nan_table_index = -2 macau_data = os.path.join(DATA_PATH, 'macau.html') @@ -381,7 +380,7 @@ def test_thousands_macau_stats(self): self.assertFalse(any(s.isnull().any() for _, s in df.iteritems())) - @slow + @tm.slow def test_thousands_macau_index_col(self): all_non_nan_table_index = -2 macau_data = os.path.join(DATA_PATH, 'macau.html') @@ -520,9 +519,9 @@ def test_nyse_wsj_commas_table(self): 'Volume', 'Price', 'Chg', '% Chg']) nrows = 100 self.assertEqual(df.shape[0], nrows) - self.assertTrue(df.columns.equals(columns)) + self.assert_index_equal(df.columns, columns) - @slow + @tm.slow def test_banklist_header(self): from pandas.io.html import _remove_whitespace @@ -561,7 +560,7 @@ def try_remove_ws(x): coerce=True) tm.assert_frame_equal(converted, gtnew) - @slow + @tm.slow def test_gold_canyon(self): gc = 'Gold Canyon' with open(self.banklist_data, 'r') as f: @@ -663,7 +662,31 @@ def test_wikipedia_states_table(self): assert os.path.isfile(data), '%r is not a file' % data assert os.path.getsize(data), '%r is an empty file' % data result = self.read_html(data, 'Arizona', header=1)[0] - nose.tools.assert_equal(result['sq mi'].dtype, np.dtype('float64')) + self.assertEqual(result['sq mi'].dtype, np.dtype('float64')) + + def test_decimal_rows(self): + + # GH 12907 + data = StringIO(''' + + + + + + + + + + + + +
+            <table>
+                <thead>
+                    <tr><th>Header</th></tr>
+                </thead>
+                <tbody>
+                    <tr><td>1100#101</td></tr>
+                </tbody>
+            </table>
+ + ''') + expected = DataFrame(data={'Header': 1100.101}, index=[0]) + result = self.read_html(data, decimal='#')[0] + nose.tools.assert_equal(result['Header'].dtype, np.dtype('float64')) + tm.assert_frame_equal(result, expected) def test_bool_header_arg(self): # GH 6114 @@ -753,7 +776,7 @@ def test_works_on_valid_markup(self): tm.assertIsInstance(dfs, list) tm.assertIsInstance(dfs[0], DataFrame) - @slow + @tm.slow def test_fallback_success(self): _skip_if_none_of(('bs4', 'html5lib')) banklist_data = os.path.join(DATA_PATH, 'banklist.html') @@ -796,7 +819,7 @@ def get_elements_from_file(url, element='table'): return soup.find_all(element) -@slow +@tm.slow def test_bs4_finds_tables(): filepath = os.path.join(DATA_PATH, "spam.html") with warnings.catch_warnings(): @@ -811,13 +834,13 @@ def get_lxml_elements(url, element): return doc.xpath('.//{0}'.format(element)) -@slow +@tm.slow def test_lxml_finds_tables(): filepath = os.path.join(DATA_PATH, "spam.html") assert get_lxml_elements(filepath, 'table') -@slow +@tm.slow def test_lxml_finds_tbody(): filepath = os.path.join(DATA_PATH, "spam.html") assert get_lxml_elements(filepath, 'tbody') diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 7c61a6942e8e7..ad7d6c3c9f94f 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -150,7 +150,11 @@ def test_scalar_complex(self): def test_list_numpy_float(self): x = [np.float32(np.random.rand()) for i in range(5)] x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) + # current msgpack cannot distinguish list/tuple + tm.assert_almost_equal(tuple(x), x_rec) + + x_rec = self.encode_decode(tuple(x)) + tm.assert_almost_equal(tuple(x), x_rec) def test_list_numpy_float_complex(self): if not hasattr(np, 'complex128'): @@ -165,7 +169,11 @@ def test_list_numpy_float_complex(self): def test_list_float(self): x = [np.random.rand() for i in range(5)] x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) + # current msgpack cannot distinguish list/tuple + tm.assert_almost_equal(tuple(x), x_rec) + + x_rec = self.encode_decode(tuple(x)) + tm.assert_almost_equal(tuple(x), x_rec) def test_list_float_complex(self): x = [np.random.rand() for i in range(5)] + \ @@ -217,7 +225,11 @@ def test_numpy_array_complex(self): def test_list_mixed(self): x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')] x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) + # current msgpack cannot distinguish list/tuple + tm.assert_almost_equal(tuple(x), x_rec) + + x_rec = self.encode_decode(tuple(x)) + tm.assert_almost_equal(tuple(x), x_rec) class TestBasic(TestPackers): @@ -286,30 +298,30 @@ def test_basic_index(self): for s, i in self.d.items(): i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + self.assert_index_equal(i, i_rec) # datetime with no freq (GH5506) i = Index([Timestamp('20130101'), Timestamp('20130103')]) i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + self.assert_index_equal(i, i_rec) # datetime with timezone i = Index([Timestamp('20130101 9:00:00'), Timestamp( '20130103 11:00:00')]).tz_localize('US/Eastern') i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + self.assert_index_equal(i, i_rec) def test_multi_index(self): for s, i in self.mi.items(): i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + self.assert_index_equal(i, i_rec) def test_unicode(self): i = tm.makeUnicodeIndex(100) i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + 
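gh-12907 adds a decimal keyword to read_html; '#' is just the test's stand-in marker. A sketch, assuming an HTML parser (lxml, or bs4 with html5lib) is installed; read_html also accepts the raw markup string directly:

    import pandas as pd

    html = ('<table><thead><tr><th>Header</th></tr></thead>'
            '<tbody><tr><td>1100#101</td></tr></tbody></table>')
    df = pd.read_html(html, decimal='#')[0]
    # df['Header'][0] == 1100.101 as float64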
self.assert_index_equal(i, i_rec) class TestSeries(TestPackers): @@ -659,14 +671,14 @@ def _test_small_strings_no_warn(self, compress): with tm.assert_produces_warning(None): empty_unpacked = self.encode_decode(empty, compress=compress) - np.testing.assert_array_equal(empty_unpacked, empty) + tm.assert_numpy_array_equal(empty_unpacked, empty) self.assertTrue(empty_unpacked.flags.writeable) char = np.array([ord(b'a')], dtype='uint8') with tm.assert_produces_warning(None): char_unpacked = self.encode_decode(char, compress=compress) - np.testing.assert_array_equal(char_unpacked, char) + tm.assert_numpy_array_equal(char_unpacked, char) self.assertTrue(char_unpacked.flags.writeable) # if this test fails I am sorry because the interpreter is now in a # bad state where b'a' points to 98 == ord(b'b'). @@ -676,7 +688,7 @@ def _test_small_strings_no_warn(self, compress): # always be the same (unless we were able to mutate the shared # character singleton in which case ord(b'a') == ord(b'b'). self.assertEqual(ord(b'a'), ord(u'a')) - np.testing.assert_array_equal( + tm.assert_numpy_array_equal( char_unpacked, np.array([ord(b'b')], dtype='uint8'), ) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 4ff0363d07df6..c12d6e02e3a2e 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -85,7 +85,7 @@ def compare_series_ts(self, result, expected, typ, version): tm.assert_series_equal(result, expected) tm.assert_equal(result.index.freq, expected.index.freq) tm.assert_equal(result.index.freq.normalize, False) - tm.assert_numpy_array_equal(result > 0, expected > 0) + tm.assert_series_equal(result > 0, expected > 0) # GH 9291 freq = result.index.freq @@ -108,6 +108,13 @@ def compare_series_dt_tz(self, result, expected, typ, version): else: tm.assert_series_equal(result, expected) + def compare_series_cat(self, result, expected, typ, version): + # Categorical.ordered is changed in < 0.16.0 + if LooseVersion(version) < '0.16.0': + tm.assert_series_equal(result, expected, check_categorical=False) + else: + tm.assert_series_equal(result, expected) + def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): # 8260 # dtype is object < 0.17.0 @@ -117,6 +124,16 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): else: tm.assert_frame_equal(result, expected) + def compare_frame_cat_onecol(self, result, expected, typ, version): + # Categorical.ordered is changed in < 0.16.0 + if LooseVersion(version) < '0.16.0': + tm.assert_frame_equal(result, expected, check_categorical=False) + else: + tm.assert_frame_equal(result, expected) + + def compare_frame_cat_and_float(self, result, expected, typ, version): + self.compare_frame_cat_onecol(result, expected, typ, version) + def compare_index_period(self, result, expected, typ, version): tm.assert_index_equal(result, expected) tm.assertIsInstance(result.freq, MonthEnd) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index d21189fe91a2a..9c13162bd774c 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -46,8 +46,8 @@ from distutils.version import LooseVersion -_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ - and 'blosc' or 'zlib' +_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' + else 'zlib') _multiprocess_can_split_ = False @@ -1004,7 +1004,7 @@ def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): nan_rep=nan_rep) retr = read_hdf(store, key) s_nan = 
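Why the packers tests now compare against tuple(x): the msgpack layer cannot tell list from tuple, so lists come back as tuples. Sketch using the module-level helpers this suite is built on (passing path_or_buf=None to to_msgpack returns the packed bytes):

    import numpy as np
    import pandas as pd

    x = [1.0, np.float32(3.5), u'foo']
    packed = pd.to_msgpack(None, x)   # None -> return the packed bytes
    x_rec = pd.read_msgpack(packed)
    assert x_rec == tuple(x)          # the list identity is lost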
s.replace(nan_rep, np.nan) - assert_series_equal(s_nan, retr) + assert_series_equal(s_nan, retr, check_categorical=False) for s in examples: roundtrip(s) @@ -4128,10 +4128,11 @@ def test_nan_selection_bug_4858(self): result = store.select('df', where='values>2.0') assert_frame_equal(result, expected) - def test_start_stop(self): + def test_start_stop_table(self): with ensure_clean_store(self.path) as store: + # table df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) store.append('df', df) @@ -4143,8 +4144,55 @@ def test_start_stop(self): # out of range result = store.select( 'df', [Term("columns=['A']")], start=30, stop=40) - assert(len(result) == 0) - assert(type(result) == DataFrame) + self.assertTrue(len(result) == 0) + expected = df.ix[30:40, ['A']] + tm.assert_frame_equal(result, expected) + + def test_start_stop_fixed(self): + + with ensure_clean_store(self.path) as store: + + # fixed, GH 8287 + df = DataFrame(dict(A=np.random.rand(20), + B=np.random.rand(20)), + index=pd.date_range('20130101', periods=20)) + store.put('df', df) + + result = store.select( + 'df', start=0, stop=5) + expected = df.iloc[0:5, :] + tm.assert_frame_equal(result, expected) + + result = store.select( + 'df', start=5, stop=10) + expected = df.iloc[5:10, :] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select( + 'df', start=30, stop=40) + expected = df.iloc[30:40, :] + tm.assert_frame_equal(result, expected) + + # series + s = df.A + store.put('s', s) + result = store.select('s', start=0, stop=5) + expected = s.iloc[0:5] + tm.assert_series_equal(result, expected) + + result = store.select('s', start=5, stop=10) + expected = s.iloc[5:10] + tm.assert_series_equal(result, expected) + + # sparse; not implemented + df = tm.makeDataFrame() + df.ix[3:5, 1:3] = np.nan + df.ix[8:10, -2] = np.nan + dfs = df.to_sparse() + store.put('dfs', dfs) + with self.assertRaises(NotImplementedError): + store.select('dfs', start=0, stop=5) def test_select_filter_corner(self): @@ -4829,6 +4877,9 @@ def test_read_nokey(self): df = DataFrame(np.random.rand(4, 5), index=list('abcd'), columns=list('ABCDE')) + + # Categorical dtype not supported for "fixed" format. So no need + # to test with that dtype in the dataframe here. 
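# A sketch of the contract the new start/stop tests pin down: for both
# "table" and (now) "fixed" formats, select(start=..., stop=...) should behave
# like positional .iloc slicing, including quietly-empty out-of-range windows.
# Requires PyTables; the file name here is illustrative only.
import numpy as np
import pandas as pd
import pandas.util.testing as tm

df = pd.DataFrame({'A': np.random.rand(20), 'B': np.random.rand(20)},
                  index=pd.date_range('20130101', periods=20))
with pd.HDFStore('demo.h5') as store:
    store.put('df', df)  # "fixed" format by default
    tm.assert_frame_equal(store.select('df', start=5, stop=10),
                          df.iloc[5:10])
    assert len(store.select('df', start=30, stop=40)) == 0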
with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', mode='a') reread = read_hdf(path) @@ -4836,6 +4887,60 @@ def test_read_nokey(self): df.to_hdf(path, 'df2', mode='a') self.assertRaises(ValueError, read_hdf, path) + def test_read_nokey_table(self): + # GH13231 + df = DataFrame({'i': range(5), + 'c': Series(list('abacd'), dtype='category')}) + + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', mode='a', format='table') + reread = read_hdf(path) + assert_frame_equal(df, reread) + df.to_hdf(path, 'df2', mode='a', format='table') + self.assertRaises(ValueError, read_hdf, path) + + def test_read_nokey_empty(self): + with ensure_clean_path(self.path) as path: + store = HDFStore(path) + store.close() + self.assertRaises(ValueError, read_hdf, path) + + def test_read_from_pathlib_path(self): + + # GH11773 + tm._skip_if_no_pathlib() + + from pathlib import Path + + expected = DataFrame(np.random.rand(4, 5), + index=list('abcd'), + columns=list('ABCDE')) + with ensure_clean_path(self.path) as filename: + path_obj = Path(filename) + + expected.to_hdf(path_obj, 'df', mode='a') + actual = read_hdf(path_obj, 'df') + + tm.assert_frame_equal(expected, actual) + + def test_read_from_py_localpath(self): + + # GH11773 + tm._skip_if_no_localpath() + + from py.path import local as LocalPath + + expected = DataFrame(np.random.rand(4, 5), + index=list('abcd'), + columns=list('ABCDE')) + with ensure_clean_path(self.path) as filename: + path_obj = LocalPath(filename) + + expected.to_hdf(path_obj, 'df', mode='a') + actual = read_hdf(path_obj, 'df') + + tm.assert_frame_equal(expected, actual) + class TestHDFComplexValues(Base): # GH10447 @@ -5196,7 +5301,7 @@ def test_fixed_offset_tz(self): with ensure_clean_store(self.path) as store: store['frame'] = frame recons = store['frame'] - self.assertTrue(recons.index.equals(rng)) + self.assert_index_equal(recons.index, rng) self.assertEqual(rng.tz, recons.index.tz) def test_store_timezone(self): diff --git a/pandas/io/tests/test_s3.py b/pandas/io/tests/test_s3.py new file mode 100644 index 0000000000000..8058698a906ea --- /dev/null +++ b/pandas/io/tests/test_s3.py @@ -0,0 +1,14 @@ +import nose +from pandas.util import testing as tm + +from pandas.io.common import _is_s3_url + + +class TestS3URL(tm.TestCase): + def test_is_s3_url(self): + self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com")) + self.assertFalse(_is_s3_url("s4://pandas/somethingelse.com")) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index fe782bb86d1be..830c68d62efad 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -179,7 +179,7 @@ def test_read_dta2(self): w = [x for x in w if x.category is UserWarning] # should get warning for each call to read_dta - tm.assert_equal(len(w), 3) + self.assertEqual(len(w), 3) # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats @@ -234,10 +234,11 @@ def test_read_dta4(self): expected = pd.concat([expected[col].astype('category') for col in expected], axis=1) - tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected) - tm.assert_frame_equal(parsed_115, expected) - tm.assert_frame_equal(parsed_117, expected) + # stata doesn't save .category metadata + tm.assert_frame_equal(parsed_113, expected, check_categorical=False) + 
tm.assert_frame_equal(parsed_114, expected, check_categorical=False) + tm.assert_frame_equal(parsed_115, expected, check_categorical=False) + tm.assert_frame_equal(parsed_117, expected, check_categorical=False) # File containing strls def test_read_dta12(self): @@ -374,7 +375,7 @@ def test_read_write_dta11(self): with warnings.catch_warnings(record=True) as w: original.to_stata(path, None) # should get a warning for that format. - tm.assert_equal(len(w), 1) + self.assertEqual(len(w), 1) written_and_read_again = self.read_dta(path) tm.assert_frame_equal( @@ -402,7 +403,7 @@ def test_read_write_dta12(self): with warnings.catch_warnings(record=True) as w: original.to_stata(path, None) # should get a warning for that format. - tm.assert_equal(len(w), 1) + self.assertEqual(len(w), 1) written_and_read_again = self.read_dta(path) tm.assert_frame_equal( @@ -872,8 +873,8 @@ def test_categorical_writing(self): # Silence warnings original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), expected) + res = written_and_read_again.set_index('index') + tm.assert_frame_equal(res, expected, check_categorical=False) def test_categorical_warnings_and_errors(self): # Warning for non-string labels @@ -903,7 +904,7 @@ def test_categorical_warnings_and_errors(self): with warnings.catch_warnings(record=True) as w: original.to_stata(path) # should get a warning for mixed content - tm.assert_equal(len(w), 1) + self.assertEqual(len(w), 1) def test_categorical_with_stata_missing_values(self): values = [['a' + str(i)] for i in range(120)] @@ -915,8 +916,8 @@ def test_categorical_with_stata_missing_values(self): with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), original) + res = written_and_read_again.set_index('index') + tm.assert_frame_equal(res, original, check_categorical=False) def test_categorical_order(self): # Directly construct using expected codes @@ -945,8 +946,8 @@ def test_categorical_order(self): # Read with and with out categoricals, ensure order is identical parsed_115 = read_stata(self.dta19_115) parsed_117 = read_stata(self.dta19_117) - tm.assert_frame_equal(expected, parsed_115) - tm.assert_frame_equal(expected, parsed_117) + tm.assert_frame_equal(expected, parsed_115, check_categorical=False) + tm.assert_frame_equal(expected, parsed_117, check_categorical=False) # Check identity of codes for col in expected: @@ -969,8 +970,10 @@ def test_categorical_sorting(self): categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] cat = pd.Categorical.from_codes(codes=codes, categories=categories) expected = pd.Series(cat, name='srh') - tm.assert_series_equal(expected, parsed_115["srh"]) - tm.assert_series_equal(expected, parsed_117["srh"]) + tm.assert_series_equal(expected, parsed_115["srh"], + check_categorical=False) + tm.assert_series_equal(expected, parsed_117["srh"], + check_categorical=False) def test_categorical_ordering(self): parsed_115 = read_stata(self.dta19_115) @@ -983,10 +986,10 @@ def test_categorical_ordering(self): for col in parsed_115: if not is_categorical_dtype(parsed_115[col]): continue - tm.assert_equal(True, parsed_115[col].cat.ordered) - tm.assert_equal(True, parsed_117[col].cat.ordered) - tm.assert_equal(False, parsed_115_unordered[col].cat.ordered) - tm.assert_equal(False, parsed_117_unordered[col].cat.ordered) + self.assertEqual(True, parsed_115[col].cat.ordered) + 
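# Why check_categorical=False recurs throughout these Stata (and pickle/HDF)
# comparisons: the on-disk formats do not preserve all categorical metadata,
# so only the values should be compared, not the dtype-level categories.
# Minimal demonstration of what the flag relaxes:
import pandas as pd
import pandas.util.testing as tm

a = pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b']))
b = pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']))
tm.assert_series_equal(a, b, check_categorical=False)  # values match: passes
try:
    tm.assert_series_equal(a, b)  # differing categories: fails
except AssertionError:
    pass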
self.assertEqual(True, parsed_117[col].cat.ordered) + self.assertEqual(False, parsed_115_unordered[col].cat.ordered) + self.assertEqual(False, parsed_117_unordered[col].cat.ordered) def test_read_chunks_117(self): files_117 = [self.dta1_117, self.dta2_117, self.dta3_117, @@ -1021,7 +1024,8 @@ def test_read_chunks_117(self): from_frame = parsed.iloc[pos:pos + chunksize, :] tm.assert_frame_equal( from_frame, chunk, check_dtype=False, - check_datetimelike_compat=True) + check_datetimelike_compat=True, + check_categorical=False) pos += chunksize itr.close() @@ -1087,7 +1091,8 @@ def test_read_chunks_115(self): from_frame = parsed.iloc[pos:pos + chunksize, :] tm.assert_frame_equal( from_frame, chunk, check_dtype=False, - check_datetimelike_compat=True) + check_datetimelike_compat=True, + check_categorical=False) pos += chunksize itr.close() diff --git a/pandas/io/tests/test_wb.py b/pandas/io/tests/test_wb.py index 58386c3f1c145..42884b19de03a 100644 --- a/pandas/io/tests/test_wb.py +++ b/pandas/io/tests/test_wb.py @@ -6,7 +6,6 @@ from pandas.compat import u from pandas.util.testing import network from pandas.util.testing import assert_frame_equal -from numpy.testing.decorators import slow import pandas.util.testing as tm # deprecated @@ -15,7 +14,7 @@ class TestWB(tm.TestCase): - @slow + @tm.slow @network def test_wdi_search(self): @@ -26,7 +25,7 @@ def test_wdi_search(self): result = search('gdp.*capita.*constant') self.assertTrue(result.name.str.contains('GDP').any()) - @slow + @tm.slow @network def test_wdi_download(self): @@ -55,7 +54,7 @@ def test_wdi_download(self): expected.index = result.index assert_frame_equal(result, pandas.DataFrame(expected)) - @slow + @tm.slow @network def test_wdi_download_w_retired_indicator(self): @@ -85,7 +84,7 @@ def test_wdi_download_w_retired_indicator(self): if len(result) > 0: raise nose.SkipTest("Invalid results") - @slow + @tm.slow @network def test_wdi_download_w_crash_inducing_countrycode(self): @@ -103,7 +102,7 @@ def test_wdi_download_w_crash_inducing_countrycode(self): if len(result) > 0: raise nose.SkipTest("Invalid results") - @slow + @tm.slow @network def test_wdi_get_countries(self): result = get_countries() diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 328166168a3fc..a9c7f93097f1b 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -493,7 +493,21 @@ def fast_unique_multiple_list(list lists): @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list_gen(object gen): +def fast_unique_multiple_list_gen(object gen, bint sort=True): + """ + Generate a list of unique values from a generator of lists. 
+ + Parameters + ---------- + gen : generator object + A generator of lists from which the unique list is created + sort : boolean + Whether or not to sort the resulting unique list + + Returns + ------- + unique_list : list of unique values + """ cdef: list buf Py_ssize_t j, n @@ -508,11 +522,11 @@ def fast_unique_multiple_list_gen(object gen): if val not in table: table[val] = stub uniques.append(val) - - try: - uniques.sort() - except Exception: - pass + if sort: + try: + uniques.sort() + except Exception: + pass return uniques diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 94d7f36f4f205..d7ddaee658fe7 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1018,7 +1018,7 @@ cdef class TextReader: col_res = _maybe_upcast(col_res) if issubclass(col_res.dtype.type, np.integer) and self.compact_ints: - col_res = downcast_int64(col_res, self.use_unsigned) + col_res = lib.downcast_int64(col_res, na_values, self.use_unsigned) if col_res is None: raise CParserError('Unable to parse column %d' % i) @@ -1501,6 +1501,7 @@ cdef inline void _to_fw_string_nogil(parser_t *parser, int col, int line_start, data += width cdef char* cinf = b'inf' +cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' cdef _try_double(parser_t *parser, int col, int line_start, int line_end, @@ -1562,7 +1563,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0: + if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF @@ -1581,7 +1582,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0: + if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF @@ -1865,76 +1866,6 @@ cdef raise_parser_error(object base, parser_t *parser): raise CParserError(message) -def downcast_int64(ndarray[int64_t] arr, bint use_unsigned=0): - cdef: - Py_ssize_t i, n = len(arr) - int64_t mx = INT64_MIN + 1, mn = INT64_MAX - int64_t NA = na_values[np.int64] - int64_t val - ndarray[uint8_t] mask - int na_count = 0 - - _mask = np.empty(n, dtype=bool) - mask = _mask.view(np.uint8) - - for i in range(n): - val = arr[i] - - if val == NA: - mask[i] = 1 - na_count += 1 - continue - - # not NA - mask[i] = 0 - - if val > mx: - mx = val - - if val < mn: - mn = val - - if mn >= 0 and use_unsigned: - if mx <= UINT8_MAX - 1: - result = arr.astype(np.uint8) - if na_count: - np.putmask(result, _mask, na_values[np.uint8]) - return result - - if mx <= UINT16_MAX - 1: - result = arr.astype(np.uint16) - if na_count: - np.putmask(result, _mask, na_values[np.uint16]) - return result - - if mx <= UINT32_MAX - 1: - result = arr.astype(np.uint32) - if na_count: - np.putmask(result, _mask, na_values[np.uint32]) - return result - - else: - if mn >= INT8_MIN + 1 and mx <= INT8_MAX: - result = arr.astype(np.int8) - if na_count: - np.putmask(result, _mask, na_values[np.int8]) - return result - - if mn >= INT16_MIN + 1 and mx <= INT16_MAX: - result = arr.astype(np.int16) - if na_count: - np.putmask(result, _mask, na_values[np.int16]) - return result - - if mn >= INT32_MIN + 1 and mx <= INT32_MAX: - result = 
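# A pure-Python rendering of the Cython unique helper documented above (a
# sketch, not the shipped implementation): stream lists out of a generator,
# keep first-seen order, and sort at the end only when the new `sort` flag
# asks for it.
def unique_multiple_list_gen(gen, sort=True):
    seen = {}
    uniques = []
    for buf in gen:
        for val in buf:
            if val not in seen:
                seen[val] = None
                uniques.append(val)
    if sort:
        try:
            uniques.sort()
        except TypeError:  # unorderable mix: keep insertion order
            pass
    return uniques

assert unique_multiple_list_gen(iter([[3, 1], [1, 2]])) == [1, 2, 3]
assert unique_multiple_list_gen(iter([[3, 1], [1, 2]]), sort=False) == [3, 1, 2]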
arr.astype(np.int32) - if na_count: - np.putmask(result, _mask, na_values[np.int32]) - return result - - return arr - - def _concatenate_chunks(list chunks): cdef: list names = list(chunks[0].keys()) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index e114bee87ca27..0312fb023f7fd 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -152,9 +152,17 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', # Create array, do *not* copy data by default if copy: - subarr = np.array(values, dtype=dtype, copy=True) + try: + # ToDo: Can remove this error handling when we actually + # support other dtypes + subarr = np.array(values, dtype=dtype, copy=True) + except ValueError: + subarr = np.array(values, copy=True) else: - subarr = np.asarray(values, dtype=dtype) + try: + subarr = np.asarray(values, dtype=dtype) + except ValueError: + subarr = np.asarray(values) # if we have a bool type, make sure that we have a bool fill_value if ((dtype is not None and issubclass(dtype.type, np.bool_)) or @@ -437,12 +445,12 @@ def count(self): @property def _null_fill_value(self): - return np.isnan(self.fill_value) + return com.isnull(self.fill_value) @property def _valid_sp_values(self): sp_vals = self.sp_values - mask = np.isfinite(sp_vals) + mask = com.notnull(sp_vals) return sp_vals[mask] @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs) @@ -616,8 +624,8 @@ def make_sparse(arr, kind='block', fill_value=nan): if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") - if np.isnan(fill_value): - mask = ~np.isnan(arr) + if com.isnull(fill_value): + mask = com.notnull(arr) else: mask = arr != fill_value diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index a783a7c596955..519068b97a010 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -5,14 +5,13 @@ # pylint: disable=E1101,E1103,W0231 -from numpy import nan, ndarray import numpy as np import warnings import operator from pandas.compat.numpy import function as nv from pandas.core.common import isnull, _values_from_object, _maybe_match_name -from pandas.core.index import Index, _ensure_index +from pandas.core.index import Index, _ensure_index, InvalidIndexError from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.internals import SingleBlockManager @@ -135,7 +134,7 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if is_sparse_array: fill_value = data.fill_value else: - fill_value = nan + fill_value = np.nan if is_sparse_array: if isinstance(data, SparseSeries) and index is None: @@ -393,8 +392,10 @@ def _get_val_at(self, loc): def __getitem__(self, key): try: - return self._get_val_at(self.index.get_loc(key)) + return self.index.get_value(self, key) + except InvalidIndexError: + pass except KeyError: if isinstance(key, (int, np.integer)): return self._get_val_at(key) @@ -406,13 +407,12 @@ def __getitem__(self, key): # Could not hash item, must be array-like? pass - # is there a case where this would NOT be an ndarray? 
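# Why the sparse code above trades np.isnan for the common isnull/notnull
# helpers: np.isnan raises on non-float data, while pandas' null checks answer
# for any dtype, which is exactly what object-dtype SparseArrays (e.g.
# fill_value='A') need. Illustration:
import numpy as np
import pandas as pd

assert pd.isnull(np.nan) and not pd.isnull('A')
try:
    np.isnan('A')
except TypeError:
    pass  # the failure mode the change avoids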
- # need to find an example, I took out the case for now - key = _values_from_object(key) - dataSlice = self.values[key] - new_index = Index(self.index.view(ndarray)[key]) - return self._constructor(dataSlice, index=new_index).__finalize__(self) + if self.index.nlevels > 1 and isinstance(key, tuple): + # to handle MultiIndex labels + key = self.index.get_loc(key) + return self._constructor(self.values[key], + index=self.index[key]).__finalize__(self) def _get_values(self, indexer): try: diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 26d018c56a8a8..dd2126d0f52d2 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -46,6 +46,17 @@ def test_constructor_dtype(self): self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) + def test_constructor_object_dtype(self): + # GH 11856 + arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object) + self.assertEqual(arr.dtype, np.object) + self.assertTrue(np.isnan(arr.fill_value)) + + arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object, + fill_value='A') + self.assertEqual(arr.dtype, np.object) + self.assertEqual(arr.fill_value, 'A') + def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan])) diff --git a/pandas/sparse/tests/test_format.py b/pandas/sparse/tests/test_format.py new file mode 100644 index 0000000000000..9bdc1fdd101ea --- /dev/null +++ b/pandas/sparse/tests/test_format.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import numpy as np +import pandas as pd + +import pandas.util.testing as tm +from pandas.compat import (is_platform_windows, + is_platform_32bit) +from pandas.core.config import option_context + + +use_32bit_repr = is_platform_windows() or is_platform_32bit() + + +class TestSeriesFormatting(tm.TestCase): + + _multiprocess_can_split_ = True + + @property + def dtype_format_for_platform(self): + return '' if use_32bit_repr else ', dtype=int32' + + def test_sparse_max_row(self): + s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() + result = repr(s) + dfm = self.dtype_format_for_platform + exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" + "4 NaN\ndtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + # GH 10560 + result = repr(s) + exp = ("0 1.0\n ... \n4 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm)) + self.assertEqual(result, exp) + + def test_sparse_mi_max_row(self): + idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), + ('C', 0), ('C', 1), ('C', 2)]) + s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], + index=idx).to_sparse() + result = repr(s) + dfm = self.dtype_format_for_platform + exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n" + "C 0 3.0\n 1 NaN\n 2 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + # GH 13144 + result = repr(s) + exp = ("A 0 1.0\n ... 
\nC 2 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm)) + self.assertEqual(result, exp) diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index fde4ad15e1185..43d35a4e7f72e 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -97,8 +97,11 @@ def test_constructor(self): # constructed zframe from matrix above self.assertEqual(self.zframe['A'].fill_value, 0) - tm.assert_almost_equal([0, 0, 0, 0, 1, 2, 3, 4, 5, 6], - self.zframe['A'].values) + tm.assert_numpy_array_equal(pd.SparseArray([1., 2., 3., 4., 5., 6.]), + self.zframe['A'].values) + tm.assert_numpy_array_equal(np.array([0., 0., 0., 0., 1., 2., + 3., 4., 5., 6.]), + self.zframe['A'].to_dense().values) # construct no data sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10)) @@ -380,8 +383,8 @@ def test_set_value(self): res2 = res.set_value('foobar', 'qux', 1.5) self.assertIsNot(res2, res) - self.assert_numpy_array_equal(res2.columns, - list(self.frame.columns) + ['qux']) + self.assert_index_equal(res2.columns, + pd.Index(list(self.frame.columns) + ['qux'])) self.assertEqual(res2.get_value('foobar', 'qux'), 1.5) def test_fancy_index_misc(self): @@ -407,7 +410,7 @@ def test_getitem_overload(self): subindex = self.frame.index[indexer] subframe = self.frame[indexer] - self.assert_numpy_array_equal(subindex, subframe.index) + self.assert_index_equal(subindex, subframe.index) self.assertRaises(Exception, self.frame.__getitem__, indexer[:-1]) def test_setitem(self): diff --git a/pandas/sparse/tests/test_groupby.py b/pandas/sparse/tests/test_groupby.py new file mode 100644 index 0000000000000..0cb33f4ea0a56 --- /dev/null +++ b/pandas/sparse/tests/test_groupby.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +import numpy as np +import pandas as pd +import pandas.util.testing as tm + + +class TestSparseGroupBy(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': [np.nan, np.nan, 1, 2, + np.nan, 1, np.nan, np.nan]}) + self.sparse = self.dense.to_sparse() + + def test_first_last_nth(self): + # tests for first / last / nth + sparse_grouped = self.sparse.groupby('A') + dense_grouped = self.dense.groupby('A') + + tm.assert_frame_equal(sparse_grouped.first(), + dense_grouped.first()) + tm.assert_frame_equal(sparse_grouped.last(), + dense_grouped.last()) + tm.assert_frame_equal(sparse_grouped.nth(1), + dense_grouped.nth(1)) + + def test_aggfuncs(self): + sparse_grouped = self.sparse.groupby('A') + dense_grouped = self.dense.groupby('A') + + tm.assert_frame_equal(sparse_grouped.mean(), + dense_grouped.mean()) + + # ToDo: sparse sum includes str column + # tm.assert_frame_equal(sparse_grouped.sum(), + # dense_grouped.sum()) + + tm.assert_frame_equal(sparse_grouped.count(), + dense_grouped.count()) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index ca2996941aef7..1f88d22bd8f93 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -10,9 +10,13 @@ class TestSparseSeriesIndexing(tm.TestCase): _multiprocess_can_split_ = True + def setUp(self): + self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) + self.sparse = self.orig.to_sparse() + def test_getitem(self): - orig = 
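# The new sparse groupby/pivot test modules above share one pattern: run the
# operation on a dense frame and on its .to_sparse() twin and require
# identical results. The pattern in miniature:
import numpy as np
import pandas as pd
import pandas.util.testing as tm

dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar'],
                      'C': np.random.randn(4)})
sparse = dense.to_sparse()
tm.assert_frame_equal(sparse.groupby('A').mean(),
                      dense.groupby('A').mean())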
pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse[0], 1) self.assertTrue(np.isnan(sparse[1])) @@ -33,8 +37,9 @@ def test_getitem(self): tm.assert_sp_series_equal(result, exp) def test_getitem_slice(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse + tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse()) tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse()) tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse()) @@ -84,8 +89,8 @@ def test_getitem_slice_fill_value(self): orig[-5:].to_sparse(fill_value=0)) def test_loc(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse.loc[0], 1) self.assertTrue(np.isnan(sparse.loc[1])) @@ -154,10 +159,17 @@ def test_loc_index_fill_value(self): tm.assert_sp_series_equal(result, exp) def test_loc_slice(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) + def test_loc_slice_index_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + sparse = orig.to_sparse(fill_value=0) + + tm.assert_sp_series_equal(sparse.loc['C':], + orig.loc['C':].to_sparse(fill_value=0)) + def test_loc_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) @@ -165,8 +177,8 @@ def test_loc_slice_fill_value(self): orig.loc[2:].to_sparse(fill_value=0)) def test_iloc(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse.iloc[3], 3) self.assertTrue(np.isnan(sparse.iloc[2])) @@ -234,8 +246,9 @@ def test_at_fill_value(self): self.assertEqual(sparse.at['e'], orig.at['e']) def test_iat(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse + self.assertEqual(sparse.iat[0], orig.iat[0]) self.assertTrue(np.isnan(sparse.iat[1])) self.assertTrue(np.isnan(sparse.iat[2])) @@ -356,6 +369,111 @@ def test_reindex_fill_value(self): tm.assert_sp_series_equal(res, exp) +class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): + + _multiprocess_can_split_ = True + + def setUp(self): + # MultiIndex with duplicated values + idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), + ('C', 0), ('C', 1)]) + self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx) + self.sparse = self.orig.to_sparse() + + def test_getitem_multi(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse[0], orig[0]) + self.assertTrue(np.isnan(sparse[1])) + self.assertEqual(sparse[3], orig[3]) + + tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse()) + tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse()) + + result = sparse[[1, 3, 4]] + exp = orig[[1, 3, 4]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # dense array + result = sparse[orig % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # sparse array (actually it coerces to normal Series) + result = sparse[sparse % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + def test_getitem_multi_tuple(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse['C', 0], 
orig['C', 0]) + self.assertTrue(np.isnan(sparse['A', 1])) + self.assertTrue(np.isnan(sparse['B', 0])) + + def test_getitems_slice_multi(self): + orig = self.orig + sparse = self.sparse + + tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + + tm.assert_sp_series_equal(sparse.loc['A':'B'], + orig.loc['A':'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + + def test_loc(self): + # needs to be overridden to use different labels + orig = self.orig + sparse = self.sparse + + tm.assert_sp_series_equal(sparse.loc['A'], + orig.loc['A'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['B'], + orig.loc['B'].to_sparse()) + + result = sparse.loc[[1, 3, 4]] + exp = orig.loc[[1, 3, 4]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # exceeds the bounds + result = sparse.loc[[1, 3, 4, 5]] + exp = orig.loc[[1, 3, 4, 5]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # dense array + result = sparse.loc[orig % 2 == 1] + exp = orig.loc[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # sparse array (actually it coerces to normal Series) + result = sparse.loc[sparse % 2 == 1] + exp = orig.loc[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + def test_loc_multi_tuple(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse.loc['C', 0], orig.loc['C', 0]) + self.assertTrue(np.isnan(sparse.loc['A', 1])) + self.assertTrue(np.isnan(sparse.loc['B', 0])) + + def test_loc_slice(self): + orig = self.orig + sparse = self.sparse + tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + + tm.assert_sp_series_equal(sparse.loc['A':'B'], + orig.loc['A':'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + + class TestSparseDataFrameIndexing(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 352355fd55c23..11bf980a99fec 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -3,7 +3,6 @@ import nose # noqa import numpy as np import operator -from numpy.testing import assert_equal import pandas.util.testing as tm from pandas import compat @@ -51,14 +50,17 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): yindex = BlockIndex(TEST_LENGTH, yloc, ylen) bresult = xindex.make_union(yindex) assert (isinstance(bresult, BlockIndex)) - assert_equal(bresult.blocs, eloc) - assert_equal(bresult.blengths, elen) + tm.assert_numpy_array_equal(bresult.blocs, + np.array(eloc, dtype=np.int32)) + tm.assert_numpy_array_equal(bresult.blengths, + np.array(elen, dtype=np.int32)) ixindex = xindex.to_int_index() iyindex = yindex.to_int_index() iresult = ixindex.make_union(iyindex) assert (isinstance(iresult, IntIndex)) - assert_equal(iresult.indices, bresult.to_int_index().indices) + tm.assert_numpy_array_equal(iresult.indices, + bresult.to_int_index().indices) """ x: ---- @@ -411,7 +413,8 @@ def test_to_int_index(self): block = BlockIndex(20, locs, lengths) dense = block.to_int_index() - assert_equal(dense.indices, exp_inds) + tm.assert_numpy_array_equal(dense.indices, + np.array(exp_inds, dtype=np.int32)) def 
test_to_block_index(self): index = BlockIndex(10, [0, 5], [4, 5]) @@ -489,7 +492,7 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): ydindex, yfill) self.assertTrue(rb_index.to_int_index().equals(ri_index)) - assert_equal(result_block_vals, result_int_vals) + tm.assert_numpy_array_equal(result_block_vals, result_int_vals) # check versus Series... xseries = Series(x, xdindex.indices) @@ -501,8 +504,9 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): series_result = python_op(xseries, yseries) series_result = series_result.reindex(ri_index.indices) - assert_equal(result_block_vals, series_result.values) - assert_equal(result_int_vals, series_result.values) + tm.assert_numpy_array_equal(result_block_vals, + series_result.values) + tm.assert_numpy_array_equal(result_int_vals, series_result.values) check_cases(_check_case) diff --git a/pandas/sparse/tests/test_panel.py b/pandas/sparse/tests/test_panel.py index 89a90f5be40e6..e988ddebd92f0 100644 --- a/pandas/sparse/tests/test_panel.py +++ b/pandas/sparse/tests/test_panel.py @@ -121,7 +121,8 @@ def _compare_with_dense(panel): dlp = panel.to_dense().to_frame() self.assert_numpy_array_equal(slp.values, dlp.values) - self.assertTrue(slp.index.equals(dlp.index)) + self.assert_index_equal(slp.index, dlp.index, + check_names=False) _compare_with_dense(self.panel) _compare_with_dense(self.panel.reindex(items=['ItemA'])) diff --git a/pandas/sparse/tests/test_pivot.py b/pandas/sparse/tests/test_pivot.py new file mode 100644 index 0000000000000..482a99a96194f --- /dev/null +++ b/pandas/sparse/tests/test_pivot.py @@ -0,0 +1,52 @@ +import numpy as np +import pandas as pd +import pandas.util.testing as tm + + +class TestPivotTable(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': [np.nan, np.nan, 1, 2, + np.nan, 1, np.nan, np.nan]}) + self.sparse = self.dense.to_sparse() + + def test_pivot_table(self): + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='C') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='C') + tm.assert_frame_equal(res_sparse, res_dense) + + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='E') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='E') + tm.assert_frame_equal(res_sparse, res_dense) + + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='E', aggfunc='mean') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='E', aggfunc='mean') + tm.assert_frame_equal(res_sparse, res_dense) + + # ToDo: sum doesn't handle nan properly + # res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + # values='E', aggfunc='sum') + # res_dense = pd.pivot_table(self.dense, index='A', columns='B', + # values='E', aggfunc='sum') + # tm.assert_frame_equal(res_sparse, res_dense) + + def test_pivot_table_multi(self): + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values=['D', 'E']) + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values=['D', 'E']) + tm.assert_frame_equal(res_sparse, res_dense) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 44bc51077ef3e..27112319ea915 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -5,7 +5,6 
@@ from numpy import nan import numpy as np import pandas as pd -from numpy.testing import assert_equal from pandas import Series, DataFrame, bdate_range from pandas.core.datetools import BDay @@ -148,20 +147,23 @@ def test_series_density(self): def test_sparse_to_dense(self): arr, index = _test_data1() series = self.bseries.to_dense() - assert_equal(series, arr) + tm.assert_series_equal(series, Series(arr, name='bseries')) series = self.bseries.to_dense(sparse_only=True) - assert_equal(series, arr[np.isfinite(arr)]) + + indexer = np.isfinite(arr) + exp = Series(arr[indexer], index=index[indexer], name='bseries') + tm.assert_series_equal(series, exp) series = self.iseries.to_dense() - assert_equal(series, arr) + tm.assert_series_equal(series, Series(arr, name='iseries')) arr, index = _test_data1_zero() series = self.zbseries.to_dense() - assert_equal(series, arr) + tm.assert_series_equal(series, Series(arr, name='zbseries')) series = self.ziseries.to_dense() - assert_equal(series, arr) + tm.assert_series_equal(series, Series(arr)) def test_to_dense_fill_value(self): s = pd.Series([1, np.nan, np.nan, 3, np.nan]) @@ -225,8 +227,8 @@ def test_constructor(self): tm.assertIsInstance(self.iseries.sp_index, IntIndex) self.assertEqual(self.zbseries.fill_value, 0) - assert_equal(self.zbseries.values.values, - self.bseries.to_dense().fillna(0).values) + tm.assert_numpy_array_equal(self.zbseries.values.values, + self.bseries.to_dense().fillna(0).values) # pass SparseSeries def _check_const(sparse, name): @@ -252,7 +254,7 @@ def _check_const(sparse, name): # pass Series bseries2 = SparseSeries(self.bseries.to_dense()) - assert_equal(self.bseries.sp_values, bseries2.sp_values) + tm.assert_numpy_array_equal(self.bseries.sp_values, bseries2.sp_values) # pass dict? @@ -292,7 +294,7 @@ def test_constructor_ndarray(self): def test_constructor_nonnan(self): arr = [0, 0, 0, nan, nan] sp_series = SparseSeries(arr, fill_value=0) - assert_equal(sp_series.values.values, arr) + tm.assert_numpy_array_equal(sp_series.values.values, np.array(arr)) self.assertEqual(len(sp_series), 5) self.assertEqual(sp_series.shape, (5, )) @@ -724,9 +726,9 @@ def test_dropna(self): expected = sp.to_dense().valid() expected = expected[expected != 0] - - tm.assert_almost_equal(sp_valid.values, expected.values) - self.assertTrue(sp_valid.index.equals(expected.index)) + exp_arr = pd.SparseArray(expected.values, fill_value=0, kind='block') + tm.assert_sp_array_equal(sp_valid.values, exp_arr) + self.assert_index_equal(sp_valid.index, expected.index) self.assertEqual(len(sp_valid.sp_values), 2) result = self.bseries.dropna() @@ -1019,6 +1021,15 @@ def test_from_coo_nodense_index(self): check = check.dropna().to_sparse() tm.assert_sp_series_equal(ss, check) + def test_from_coo_long_repr(self): + # GH 13114 + # test it doesn't raise error. 
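# The sweep away from numpy.testing in this file (and the modules above) buys
# stricter comparisons: numpy's helpers coerce pandas objects to arrays and
# tolerate dtype differences, while the tm.assert_* helpers check type, dtype,
# index and name. Sketch of the difference:
import numpy as np
import pandas as pd
import pandas.util.testing as tm

np.testing.assert_array_equal(pd.Series([1, 2]), np.array([1, 2]))  # passes
try:
    tm.assert_numpy_array_equal(np.array([1, 2]),
                                np.array([1.0, 2.0]))  # dtype differs: fails
except AssertionError:
    pass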
Formatting is tested in test_format + tm._skip_if_no_scipy() + import scipy.sparse + + sparse = SparseSeries.from_coo(scipy.sparse.rand(350, 18)) + repr(sparse) + def _run_test(self, ss, kwargs, check): results = ss.to_coo(**kwargs) self._check_results_to_coo(results, check) @@ -1031,8 +1042,7 @@ def _run_test(self, ss, kwargs, check): results = (results[0].T, results[2], results[1]) self._check_results_to_coo(results, check) - @staticmethod - def _check_results_to_coo(results, check): + def _check_results_to_coo(self, results, check): (A, il, jl) = results (A_result, il_result, jl_result) = check # convert to dense and compare # or compare directly as difference of sparse # assert(abs(A - A_result).max() < 1e-12) # max is failing in python # 2.6 - assert_equal(il, il_result) - assert_equal(jl, jl_result) + self.assertEqual(il, il_result) + self.assertEqual(jl, jl_result) def test_concat(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 843031fafa1a9..262e036ff44f1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -6,6 +6,20 @@ iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 +cdef extern from "headers/stdint.h": + enum: UINT8_MAX + enum: UINT16_MAX + enum: UINT32_MAX + enum: UINT64_MAX + enum: INT8_MIN + enum: INT8_MAX + enum: INT16_MIN + enum: INT16_MAX + enum: INT32_MAX + enum: INT32_MIN + enum: INT64_MAX + enum: INT64_MIN + # core.common import for fast inference checks def is_float(object obj): return util.is_float_object(obj) @@ -569,7 +583,7 @@ def maybe_convert_numeric(object[:] values, set na_values, for i in range(n): val = values[i] - if val in na_values: + if val.__hash__ is not None and val in na_values: floats[i] = complexes[i] = nan seen_float = True elif util.is_float_object(val): @@ -596,7 +610,13 @@ def maybe_convert_numeric(object[:] values, set na_values, else: try: status = floatify(val, &fval, &maybe_int) - floats[i] = fval + + if fval in na_values: + floats[i] = complexes[i] = nan + seen_float = True + else: + floats[i] = fval + if not seen_float: if maybe_int: as_int = int(val) @@ -642,6 +662,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint seen_float = 0 bint seen_complex = 0 bint seen_datetime = 0 + bint seen_datetimetz = 0 bint seen_timedelta = 0 bint seen_int = 0 bint seen_bool = 0 @@ -675,6 +696,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if val is None: seen_null = 1 floats[i] = complexes[i] = fnan + elif val is NaT: + if convert_datetime: + idatetimes[i] = iNaT + seen_datetime = 1 + if convert_timedelta: + itimedeltas[i] = iNaT + seen_timedelta = 1 + if not (convert_datetime or convert_timedelta): + seen_object = 1 elif util.is_bool_object(val): seen_bool = 1 bools[i] = val @@ -710,9 +740,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, complexes[i] = val seen_complex = 1 elif PyDateTime_Check(val) or util.is_datetime64_object(val): + + # if we have a tz attached then return the objects if convert_datetime: - seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value + if getattr(val, 'tzinfo', None) is not None: + seen_datetimetz = 1 + break + else: + seen_datetime = 1 + idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value else: seen_object = 1 break @@ -731,6 +767,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen_numeric = 
seen_complex or seen_float or seen_int + # we try to coerce datetime w/tz but they must all have the same tz + if seen_datetimetz: + if len(set([ getattr(val, 'tz', None) for val in objects ])) == 1: + from pandas import DatetimeIndex + return DatetimeIndex(objects) + seen_object = 1 + if not seen_object: if not safe: @@ -1103,7 +1146,24 @@ def map_infer(ndarray arr, object f, bint convert=1): return result -def to_object_array(list rows): +def to_object_array(list rows, int min_width=0): + """ + Convert a list of lists into an object array. + + Parameters + ---------- + rows : list of lists + A list of lists to be converted into an array + min_width : int + The minimum width of the object array. If a list + in `rows` contains fewer than `min_width` elements, + the remaining elements in the corresponding row + will all be `NaN`. + + Returns + ------- + obj_array : numpy array of the object dtype + """ cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result @@ -1111,7 +1171,7 @@ def to_object_array(list rows): n = len(rows) - k = 0 + k = min_width for i from 0 <= i < n: tmp = len(rows[i]) if tmp > k: @@ -1194,3 +1254,74 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) + + +def downcast_int64(ndarray[int64_t] arr, object na_values, + bint use_unsigned=0): + cdef: + Py_ssize_t i, n = len(arr) + int64_t mx = INT64_MIN + 1, mn = INT64_MAX + int64_t NA = na_values[np.int64] + int64_t val + ndarray[uint8_t] mask + int na_count = 0 + + _mask = np.empty(n, dtype=bool) + mask = _mask.view(np.uint8) + + for i in range(n): + val = arr[i] + + if val == NA: + mask[i] = 1 + na_count += 1 + continue + + # not NA + mask[i] = 0 + + if val > mx: + mx = val + + if val < mn: + mn = val + + if mn >= 0 and use_unsigned: + if mx <= UINT8_MAX - 1: + result = arr.astype(np.uint8) + if na_count: + np.putmask(result, _mask, na_values[np.uint8]) + return result + + if mx <= UINT16_MAX - 1: + result = arr.astype(np.uint16) + if na_count: + np.putmask(result, _mask, na_values[np.uint16]) + return result + + if mx <= UINT32_MAX - 1: + result = arr.astype(np.uint32) + if na_count: + np.putmask(result, _mask, na_values[np.uint32]) + return result + + else: + if mn >= INT8_MIN + 1 and mx <= INT8_MAX: + result = arr.astype(np.int8) + if na_count: + np.putmask(result, _mask, na_values[np.int8]) + return result + + if mn >= INT16_MIN + 1 and mx <= INT16_MAX: + result = arr.astype(np.int16) + if na_count: + np.putmask(result, _mask, na_values[np.int16]) + return result + + if mn >= INT32_MIN + 1 and mx <= INT32_MAX: + result = arr.astype(np.int32) + if na_count: + np.putmask(result, _mask, na_values[np.int32]) + return result + + return arr diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h index d47e448700029..fd5089dd8963d 100644 --- a/pandas/src/parse_helper.h +++ b/pandas/src/parse_helper.h @@ -1,5 +1,6 @@ #include #include +#include "headers/portable.h" static double xstrtod(const char *p, char **q, char decimal, char sci, int skip_trailing, int *maybe_int); @@ -39,22 +40,36 @@ int floatify(PyObject* str, double *result, int *maybe_int) { if (!status) { /* handle inf/-inf */ - if (0 == strcmp(data, "-inf")) { - *result = -HUGE_VAL; - *maybe_int = 0; - } else if (0 == strcmp(data, "inf")) { - *result = HUGE_VAL; - *maybe_int = 0; + if (strlen(data) == 3) { + if (0 == strcasecmp(data, "inf")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } + } else if (strlen(data) == 4) { + if (0 == strcasecmp(data, 
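# What the new `min_width` argument to to_object_array guarantees, restated in
# NumPy terms (a stand-in sketch, not the Cython implementation): the result
# is at least min_width columns wide and short rows are padded with NaN.
import numpy as np

def object_array_sketch(rows, min_width=0):
    # width = longest row, but never narrower than min_width
    width = max([min_width] + [len(r) for r in rows])
    out = np.empty((len(rows), width), dtype=object)
    out.fill(np.nan)
    for i, row in enumerate(rows):
        out[i, :len(row)] = row
    return out

out = object_array_sketch([[1], [1, 2, 3]], min_width=5)
assert out.shape == (2, 5) and np.isnan(out[0, 1])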
"-inf")) { + *result = -HUGE_VAL; + *maybe_int = 0; + } else if (0 == strcasecmp(data, "+inf")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } } else { - PyErr_SetString(PyExc_ValueError, "Unable to parse string"); - Py_XDECREF(tmp); - return -1; + goto parsingerror; } } Py_XDECREF(tmp); return 0; +parsingerror: + PyErr_SetString(PyExc_ValueError, "Unable to parse string"); + Py_XDECREF(tmp); + return -1; + /* #if PY_VERSION_HEX >= 0x03000000 return PyFloat_FromString(str); diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 0cb0b575b25dc..858aa58df8d7d 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -772,6 +772,9 @@ cdef class Period(object): if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: return _nat_scalar_rules[op] return PyObject_RichCompareBool(self.ordinal, other.ordinal, op) + # index/series like + elif hasattr(other, '_typ'): + return NotImplemented else: if op == Py_EQ: return NotImplemented @@ -796,8 +799,8 @@ cdef class Period(object): else: ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) - msg = 'Input cannnot be converted to Period(freq={0})' - raise ValueError(msg) + msg = 'Input cannot be converted to Period(freq={0})' + raise IncompatibleFrequency(msg.format(self.freqstr)) elif isinstance(other, offsets.DateOffset): freqstr = frequencies.get_standard_freq(other) base = frequencies.get_base_alias(freqstr) @@ -846,8 +849,8 @@ cdef class Period(object): return Period(ordinal=ordinal, freq=self.freq) elif isinstance(other, Period): if other.freq != self.freq: - raise ValueError("Cannot do arithmetic with " - "non-conforming periods") + msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) + raise IncompatibleFrequency(msg) if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: return Period(ordinal=tslib.iNaT, freq=self.freq) return self.ordinal - other.ordinal @@ -862,7 +865,6 @@ cdef class Period(object): else: return NotImplemented - def asfreq(self, freq, how='E'): """ Convert Period to desired frequency, either at the start or end of the diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index 9f102ded597fd..6780cf311c244 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -55,7 +55,9 @@ cpdef assert_dict_equal(a, b, bint compare_keys=True): return True -cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, +cpdef assert_almost_equal(a, b, + check_less_precise=False, + bint check_dtype=True, obj=None, lobj=None, robj=None): """Check that left and right objects are almost equal. @@ -63,9 +65,10 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, ---------- a : object b : object - check_less_precise : bool, default False + check_less_precise : bool or int, default False Specify comparison precision. 5 digits (False) or 3 digits (True) after decimal points are compared. 
+ If an integer, then this will be the number of decimal points to compare check_dtype: bool, default True check dtype if both a and b are np.ndarray obj : str, default None @@ -91,6 +94,8 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, if robj is None: robj = b + assert isinstance(check_less_precise, (int, bool)) + if isinstance(a, dict) or isinstance(b, dict): return assert_dict_equal(a, b) @@ -145,7 +150,7 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, for i in xrange(len(a)): try: - assert_almost_equal(a[i], b[i], check_less_precise) + assert_almost_equal(a[i], b[i], check_less_precise=check_less_precise) except AssertionError: is_unequal = True diff += 1 @@ -173,11 +178,12 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, # inf comparison return True - decimal = 5 - - # deal with differing dtypes - if check_less_precise: + if check_less_precise is True: decimal = 3 + elif check_less_precise is False: + decimal = 5 + else: + decimal = check_less_precise fa, fb = a, b diff --git a/pandas/stats/tests/test_fama_macbeth.py b/pandas/stats/tests/test_fama_macbeth.py index 2c69eb64fd61d..706becfa730c4 100644 --- a/pandas/stats/tests/test_fama_macbeth.py +++ b/pandas/stats/tests/test_fama_macbeth.py @@ -50,7 +50,9 @@ def checkFamaMacBethExtended(self, window_type, x, y, **kwds): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): reference = fama_macbeth(y=y2, x=x2, **kwds) - assert_almost_equal(reference._stats, result._stats[:, i]) + # reference._stats is tuple + assert_almost_equal(reference._stats, result._stats[:, i], + check_dtype=False) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): static = fama_macbeth(y=y2, x=x2, **kwds) diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index 725a4e8296dd2..bac824f0b4840 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -13,7 +13,6 @@ from distutils.version import LooseVersion import nose import numpy as np -from numpy.testing.decorators import slow from pandas import date_range, bdate_range from pandas.core.panel import Panel @@ -22,7 +21,7 @@ from pandas.stats.ols import _filter_data from pandas.stats.plm import NonPooledPanelOLS, PanelOLS from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assertRaisesRegexp) + assert_frame_equal, assertRaisesRegexp, slow) import pandas.util.testing as tm import pandas.compat as compat from .common import BaseTest @@ -379,7 +378,7 @@ def test_predict_longer_exog(self): model = ols(y=endog, x=exog) pred = model.y_predict - self.assertTrue(pred.index.equals(exog.index)) + self.assert_index_equal(pred.index, exog.index) def test_longpanel_series_combo(self): wp = tm.makePanel() @@ -528,13 +527,12 @@ def testFiltering(self): index = x.index.get_level_values(0) index = Index(sorted(set(index))) exp_index = Index([datetime(2000, 1, 1), datetime(2000, 1, 3)]) - self.assertTrue - (exp_index.equals(index)) + self.assert_index_equal(exp_index, index) index = x.index.get_level_values(1) index = Index(sorted(set(index))) exp_index = Index(['A', 'B']) - self.assertTrue(exp_index.equals(index)) + self.assert_index_equal(exp_index, index) x = result._x_filtered index = x.index.get_level_values(0) @@ -542,24 +540,22 @@ def testFiltering(self): exp_index = Index([datetime(2000, 1, 1), datetime(2000, 1, 3), datetime(2000, 1, 4)]) - self.assertTrue(exp_index.equals(index)) + 
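# check_less_precise now accepts an int as well as a bool: the number of
# decimal places to compare. The selection logic above, restated in plain
# Python:
def pick_decimal(check_less_precise):
    if check_less_precise is True:
        return 3
    if check_less_precise is False:
        return 5
    return check_less_precise  # explicit digit count

assert [pick_decimal(v) for v in (False, True, 2)] == [5, 3, 2]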
         self.assert_index_equal(exp_index, index)
-        assert_almost_equal(result._y.values.flat, [1, 4, 5])
+        # .flat is flatiter instance
+        assert_almost_equal(result._y.values.flat, [1, 4, 5],
+                            check_dtype=False)
 
-        exp_x = [[6, 14, 1],
-                 [9, 17, 1],
-                 [30, 48, 1]]
+        exp_x = np.array([[6, 14, 1], [9, 17, 1],
+                          [30, 48, 1]], dtype=np.float64)
         assert_almost_equal(exp_x, result._x.values)
 
-        exp_x_filtered = [[6, 14, 1],
-                          [9, 17, 1],
-                          [30, 48, 1],
-                          [11, 20, 1],
-                          [12, 21, 1]]
+        exp_x_filtered = np.array([[6, 14, 1], [9, 17, 1], [30, 48, 1],
+                                   [11, 20, 1], [12, 21, 1]],
+                                  dtype=np.float64)
         assert_almost_equal(exp_x_filtered, result._x_filtered.values)
 
-        self.assertTrue(result._x_filtered.index.levels[0].equals(
-            result.y_fitted.index))
+        self.assert_index_equal(result._x_filtered.index.levels[0],
+                                result.y_fitted.index)
 
     def test_wls_panel(self):
         y = tm.makeTimeDataFrame()
@@ -598,9 +594,11 @@ def testWithTimeEffects(self):
         with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             result = ols(y=self.panel_y2, x=self.panel_x2, time_effects=True)
 
-        assert_almost_equal(result._y_trans.values.flat, [0, -0.5, 0.5])
+        # .flat is flatiter instance
+        assert_almost_equal(result._y_trans.values.flat, [0, -0.5, 0.5],
+                            check_dtype=False)
 
-        exp_x = [[0, 0], [-10.5, -15.5], [10.5, 15.5]]
+        exp_x = np.array([[0, 0], [-10.5, -15.5], [10.5, 15.5]])
         assert_almost_equal(result._x_trans.values, exp_x)
 
         # _check_non_raw_results(result)
@@ -609,7 +607,9 @@ def testWithEntityEffects(self):
         with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             result = ols(y=self.panel_y2, x=self.panel_x2, entity_effects=True)
 
-        assert_almost_equal(result._y.values.flat, [1, 4, 5])
+        # .flat is flatiter instance
+        assert_almost_equal(result._y.values.flat, [1, 4, 5],
+                            check_dtype=False)
 
         exp_x = DataFrame([[0., 6., 14., 1.], [0, 9, 17, 1], [1, 30, 48, 1]],
                           index=result._x.index, columns=['FE_B', 'x1', 'x2',
@@ -623,7 +623,9 @@ def testWithEntityEffectsAndDroppedDummies(self):
             result = ols(y=self.panel_y2, x=self.panel_x2,
                          entity_effects=True, dropped_dummies={'entity': 'B'})
 
-        assert_almost_equal(result._y.values.flat, [1, 4, 5])
+        # .flat is flatiter instance
+        assert_almost_equal(result._y.values.flat, [1, 4, 5],
+                            check_dtype=False)
         exp_x = DataFrame([[1., 6., 14., 1.], [1, 9, 17, 1], [0, 30, 48, 1]],
                           index=result._x.index, columns=['FE_A', 'x1', 'x2',
                                                           'intercept'],
@@ -635,7 +637,9 @@ def testWithXEffects(self):
         with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1'])
 
-        assert_almost_equal(result._y.values.flat, [1, 4, 5])
+        # .flat is flatiter instance
+        assert_almost_equal(result._y.values.flat, [1, 4, 5],
+                            check_dtype=False)
 
         res = result._x
         exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1],
                            [1, 0, 48, 1]],
@@ -649,7 +653,9 @@ def testWithXEffectsAndDroppedDummies(self):
                          dropped_dummies={'x1': 30})
 
         res = result._x
-        assert_almost_equal(result._y.values.flat, [1, 4, 5])
+        # .flat is flatiter instance
+        assert_almost_equal(result._y.values.flat, [1, 4, 5],
+                            check_dtype=False)
         exp_x = DataFrame([[1., 0., 14., 1.], [0, 1, 17, 1], [0, 0, 48, 1]],
                           columns=['x1_6', 'x1_9', 'x2', 'intercept'],
                           index=res.index, dtype=float)
@@ -661,13 +667,15 @@ def testWithXEffectsAndConversion(self):
             result = ols(y=self.panel_y3, x=self.panel_x3,
                          x_effects=['x1', 'x2'])
 
-        assert_almost_equal(result._y.values.flat, [1, 2, 3, 4])
-        exp_x = [[0, 0, 0, 1, 1], [1, 0, 0, 0, 1], [0, 1, 1, 0, 1],
-                 [0, 0, 0, 1, 1]]
+        # .flat is flatiter instance
+        assert_almost_equal(result._y.values.flat, [1, 2, 3, 4],
+                            check_dtype=False)
+        exp_x = np.array([[0, 0, 0, 1, 1], [1, 0, 0, 0, 1], [0, 1, 1, 0, 1],
+                          [0, 0, 0, 1, 1]], dtype=np.float64)
         assert_almost_equal(result._x.values, exp_x)
 
         exp_index = Index(['x1_B', 'x1_C', 'x2_baz', 'x2_foo', 'intercept'])
-        self.assertTrue(exp_index.equals(result._x.columns))
+        self.assert_index_equal(exp_index, result._x.columns)
 
         # _check_non_raw_results(result)
 
@@ -675,14 +683,15 @@ def testWithXEffectsAndConversionAndDroppedDummies(self):
         with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             result = ols(y=self.panel_y3, x=self.panel_x3,
                          x_effects=['x1', 'x2'], dropped_dummies={'x2': 'foo'})
-
-        assert_almost_equal(result._y.values.flat, [1, 2, 3, 4])
-        exp_x = [[0, 0, 0, 0, 1], [1, 0, 1, 0, 1], [0, 1, 0, 1, 1],
-                 [0, 0, 0, 0, 1]]
+        # .flat is flatiter instance
+        assert_almost_equal(result._y.values.flat, [1, 2, 3, 4],
+                            check_dtype=False)
+        exp_x = np.array([[0, 0, 0, 0, 1], [1, 0, 1, 0, 1], [0, 1, 0, 1, 1],
+                          [0, 0, 0, 0, 1]], dtype=np.float64)
         assert_almost_equal(result._x.values, exp_x)
 
         exp_index = Index(['x1_B', 'x1_C', 'x2_bar', 'x2_baz', 'intercept'])
-        self.assertTrue(exp_index.equals(result._x.columns))
+        self.assert_index_equal(exp_index, result._x.columns)
 
         # _check_non_raw_results(result)
 
@@ -915,16 +924,21 @@ def setUp(self):
 
     def testFilterWithSeriesRHS(self):
         (lhs, rhs, weights, rhs_pre,
          index, valid) = _filter_data(self.TS1, {'x1': self.TS2}, None)
-        self.tsAssertEqual(self.TS1, lhs)
-        self.tsAssertEqual(self.TS2[:3], rhs['x1'])
-        self.tsAssertEqual(self.TS2, rhs_pre['x1'])
+        self.tsAssertEqual(self.TS1.astype(np.float64), lhs, check_names=False)
+        self.tsAssertEqual(self.TS2[:3].astype(np.float64), rhs['x1'],
+                           check_names=False)
+        self.tsAssertEqual(self.TS2.astype(np.float64), rhs_pre['x1'],
+                           check_names=False)
 
     def testFilterWithSeriesRHS2(self):
         (lhs, rhs, weights, rhs_pre,
          index, valid) = _filter_data(self.TS2, {'x1': self.TS1}, None)
-        self.tsAssertEqual(self.TS2[:3], lhs)
-        self.tsAssertEqual(self.TS1, rhs['x1'])
-        self.tsAssertEqual(self.TS1, rhs_pre['x1'])
+        self.tsAssertEqual(self.TS2[:3].astype(np.float64), lhs,
+                           check_names=False)
+        self.tsAssertEqual(self.TS1.astype(np.float64), rhs['x1'],
+                           check_names=False)
+        self.tsAssertEqual(self.TS1.astype(np.float64), rhs_pre['x1'],
+                           check_names=False)
 
     def testFilterWithSeriesRHS3(self):
         (lhs, rhs, weights, rhs_pre,
@@ -932,32 +946,32 @@ def testFilterWithSeriesRHS3(self):
         exp_lhs = self.TS3[2:3]
         exp_rhs = self.TS4[2:3]
         exp_rhs_pre = self.TS4[1:]
-        self.tsAssertEqual(exp_lhs, lhs)
-        self.tsAssertEqual(exp_rhs, rhs['x1'])
-        self.tsAssertEqual(exp_rhs_pre, rhs_pre['x1'])
+        self.tsAssertEqual(exp_lhs, lhs, check_names=False)
+        self.tsAssertEqual(exp_rhs, rhs['x1'], check_names=False)
+        self.tsAssertEqual(exp_rhs_pre, rhs_pre['x1'], check_names=False)
 
     def testFilterWithDataFrameRHS(self):
         (lhs, rhs, weights, rhs_pre,
          index, valid) = _filter_data(self.TS1, self.DF1, None)
-        exp_lhs = self.TS1[1:]
+        exp_lhs = self.TS1[1:].astype(np.float64)
         exp_rhs1 = self.TS2[1:3]
-        exp_rhs2 = self.TS4[1:3]
-        self.tsAssertEqual(exp_lhs, lhs)
-        self.tsAssertEqual(exp_rhs1, rhs['x1'])
-        self.tsAssertEqual(exp_rhs2, rhs['x2'])
+        exp_rhs2 = self.TS4[1:3].astype(np.float64)
+        self.tsAssertEqual(exp_lhs, lhs, check_names=False)
+        self.tsAssertEqual(exp_rhs1, rhs['x1'], check_names=False)
+        self.tsAssertEqual(exp_rhs2, rhs['x2'], check_names=False)
 
     def testFilterWithDictRHS(self):
         (lhs, rhs, weights, rhs_pre,
          index, valid) = _filter_data(self.TS1, self.DICT1, None)
-        exp_lhs = self.TS1[1:]
-        exp_rhs1 = self.TS2[1:3]
-        exp_rhs2 = self.TS4[1:3]
-        self.tsAssertEqual(exp_lhs, lhs)
-        self.tsAssertEqual(exp_rhs1, rhs['x1'])
-        self.tsAssertEqual(exp_rhs2, rhs['x2'])
-
-    def tsAssertEqual(self, ts1, ts2):
-        self.assert_numpy_array_equal(ts1, ts2)
+        exp_lhs = self.TS1[1:].astype(np.float64)
+        exp_rhs1 = self.TS2[1:3].astype(np.float64)
+        exp_rhs2 = self.TS4[1:3].astype(np.float64)
+        self.tsAssertEqual(exp_lhs, lhs, check_names=False)
+        self.tsAssertEqual(exp_rhs1, rhs['x1'], check_names=False)
+        self.tsAssertEqual(exp_rhs2, rhs['x2'], check_names=False)
+
+    def tsAssertEqual(self, ts1, ts2, **kwargs):
+        self.assert_series_equal(ts1, ts2, **kwargs)
 
 
 if __name__ == '__main__':
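Note on the repeated check_dtype=False additions above: ndarray.flat returns a
numpy flatiter rather than an ndarray, and materializing it from float panel
data yields float64 values, so a strict dtype check against the integer
literals would fail. A minimal sketch of the behaviour (numpy only):

    import numpy as np

    values = np.array([[1.], [4.], [5.]])  # float data, as in the panel tests
    flat = values.flat                     # flatiter, not ndarray
    np.asarray(flat)                       # array([1., 4., 5.]) -> float64,
                                           # while [1, 4, 5] infers int64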
diff --git a/pandas/stats/tests/test_var.py b/pandas/stats/tests/test_var.py
index 9bcd070dc1d33..9f2c95a2d3d5c 100644
--- a/pandas/stats/tests/test_var.py
+++ b/pandas/stats/tests/test_var.py
@@ -1,9 +1,8 @@
 # flake8: noqa
 
 from __future__ import print_function
 
-from numpy.testing import run_module_suite, assert_equal, TestCase
-from pandas.util.testing import assert_almost_equal
+import pandas.util.testing as tm
 from pandas.compat import range
 import nose
 
@@ -33,53 +32,56 @@ class CheckVAR(object):
 
     def test_params(self):
-        assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_3)
+        tm.assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_3)
 
     def test_neqs(self):
-        assert_equal(self.res1.neqs, self.res2.neqs)
+        tm.assert_numpy_array_equal(self.res1.neqs, self.res2.neqs)
 
     def test_nobs(self):
-        assert_equal(self.res1.avobs, self.res2.nobs)
+        tm.assert_numpy_array_equal(self.res1.avobs, self.res2.nobs)
 
     def test_df_eq(self):
-        assert_equal(self.res1.df_eq, self.res2.df_eq)
+        tm.assert_numpy_array_equal(self.res1.df_eq, self.res2.df_eq)
 
     def test_rmse(self):
         results = self.res1.results
         for i in range(len(results)):
-            assert_almost_equal(results[i].mse_resid ** .5,
-                                eval('self.res2.rmse_' + str(i + 1)), DECIMAL_6)
+            tm.assert_almost_equal(results[i].mse_resid ** .5,
+                                   eval('self.res2.rmse_' + str(i + 1)),
+                                   DECIMAL_6)
 
     def test_rsquared(self):
         results = self.res1.results
         for i in range(len(results)):
-            assert_almost_equal(results[i].rsquared,
-                                eval('self.res2.rsquared_' + str(i + 1)), DECIMAL_3)
+            tm.assert_almost_equal(results[i].rsquared,
+                                   eval('self.res2.rsquared_' + str(i + 1)),
+                                   DECIMAL_3)
 
     def test_llf(self):
         results = self.res1.results
-        assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_2)
+        tm.assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_2)
         for i in range(len(results)):
-            assert_almost_equal(results[i].llf,
-                                eval('self.res2.llf_' + str(i + 1)), DECIMAL_2)
+            tm.assert_almost_equal(results[i].llf,
+                                   eval('self.res2.llf_' + str(i + 1)),
+                                   DECIMAL_2)
 
     def test_aic(self):
-        assert_almost_equal(self.res1.aic, self.res2.aic)
+        tm.assert_almost_equal(self.res1.aic, self.res2.aic)
 
     def test_bic(self):
-        assert_almost_equal(self.res1.bic, self.res2.bic)
+        tm.assert_almost_equal(self.res1.bic, self.res2.bic)
 
     def test_hqic(self):
-        assert_almost_equal(self.res1.hqic, self.res2.hqic)
+        tm.assert_almost_equal(self.res1.hqic, self.res2.hqic)
 
     def test_fpe(self):
-        assert_almost_equal(self.res1.fpe, self.res2.fpe)
+        tm.assert_almost_equal(self.res1.fpe, self.res2.fpe)
 
     def test_detsig(self):
-        assert_almost_equal(self.res1.detomega, self.res2.detsig)
+        tm.assert_almost_equal(self.res1.detomega, self.res2.detsig)
 
     def test_bse(self):
-        assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4)
+        tm.assert_almost_equal(self.res1.bse, self.res2.bse,
+                               DECIMAL_4)
 
 
 class Foo(object):
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
index 4fcee32c46067..e67fe2cddde77 100644
--- a/pandas/tests/formats/test_format.py
+++ b/pandas/tests/formats/test_format.py
@@ -3087,11 +3087,11 @@ def test_to_csv_doublequote(self):
 
     def test_to_csv_escapechar(self):
         df = DataFrame({'col': ['a"a', '"bb"']})
-        expected = """\
+        expected = '''\
 "","col"
 "0","a\\"a"
 "1","\\"bb\\""
-"""
+'''
 
         with tm.ensure_clean('test.csv') as path:  # QUOTE_ALL
             df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
@@ -3758,25 +3758,6 @@ def test_to_string_header(self):
         exp = '0 0\n ..\n9 9'
         self.assertEqual(res, exp)
 
-    def test_sparse_max_row(self):
-        s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
-        result = repr(s)
-        dtype = '' if use_32bit_repr else ', dtype=int32'
-        exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
-               "4 NaN\ndtype: float64\nBlockIndex\n"
-               "Block locations: array([0, 3]{0})\n"
-               "Block lengths: array([1, 1]{0})".format(dtype))
-        self.assertEqual(result, exp)
-
-        with option_context("display.max_rows", 3):
-            # GH 10560
-            result = repr(s)
-            exp = ("0 1.0\n ... \n4 NaN\n"
-                   "dtype: float64\nBlockIndex\n"
-                   "Block locations: array([0, 3]{0})\n"
-                   "Block lengths: array([1, 1]{0})".format(dtype))
-            self.assertEqual(result, exp)
-
 
 class TestEngFormatter(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -3925,6 +3906,21 @@ def test_rounding(self):
         result = formatter(0)
         self.assertEqual(result, u(' 0.000'))
 
+    def test_nan(self):
+        # Issue #11981
+
+        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
+        result = formatter(np.nan)
+        self.assertEqual(result, u('NaN'))
+
+        df = pd.DataFrame({'a': [1.5, 10.3, 20.5],
+                           'b': [50.3, 60.67, 70.12],
+                           'c': [100.2, 101.33, 120.33]})
+        pt = df.pivot_table(values='a', index='b', columns='c')
+        fmt.set_eng_float_format(accuracy=1)
+        result = pt.to_string()
+        self.assertTrue('NaN' in result)
+        self.reset_display_options()
 
 def _three_digit_exp():
     return '%.4g' % 1.7e8 == '1.7e+008'
@@ -4268,6 +4264,21 @@ def test_nat_representations(self):
         self.assertEqual(f(pd.NaT), 'NaT')
 
 
+def test_format_percentiles():
+    result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
+    expected = ['1.999%', '2.001%', '50%', '66.667%', '99.99%']
+    tm.assert_equal(result, expected)
+
+    result = fmt.format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
+    expected = ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
+    tm.assert_equal(result, expected)
+
+    tm.assertRaises(ValueError, fmt.format_percentiles, [0.1, np.nan, 0.5])
+    tm.assertRaises(ValueError, fmt.format_percentiles, [-0.001, 0.1, 0.5])
+    tm.assertRaises(ValueError, fmt.format_percentiles, [2, 0.1, 0.5])
+    tm.assertRaises(ValueError, fmt.format_percentiles, [0.1, 0.5, 'a'])
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
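Note on format_percentiles, exercised by the new test above: it renders each
percentile with just enough decimal digits to keep the entries distinct, and
rejects values outside [0, 1] or non-numeric input. A usage sketch, assuming
the same fmt alias this test file uses for the pandas.formats.format module:

    import pandas.formats.format as fmt

    fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
    # ['1.999%', '2.001%', '50%', '66.667%', '99.99%']

    fmt.format_percentiles([2, 0.1, 0.5])  # raises ValueError (out of range)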
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
index 1da5487aefc01..3b50dd2c1d49f 100644
--- a/pandas/tests/frame/test_alter_axes.py
+++ b/pandas/tests/frame/test_alter_axes.py
@@ -330,28 +330,30 @@ def test_rename(self):
         # gets sorted alphabetical
         df = DataFrame(data)
         renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'})
-        self.assert_numpy_array_equal(renamed.index, ['foo', 'bar'])
+        tm.assert_index_equal(renamed.index, pd.Index(['foo', 'bar']))
 
         renamed = df.rename(index=str.upper)
-        self.assert_numpy_array_equal(renamed.index, ['BAR', 'FOO'])
+        tm.assert_index_equal(renamed.index, pd.Index(['BAR', 'FOO']))
 
         # have to pass something
         self.assertRaises(TypeError, self.frame.rename)
 
         # partial columns
         renamed = self.frame.rename(columns={'C': 'foo', 'D': 'bar'})
-        self.assert_numpy_array_equal(
-            renamed.columns, ['A', 'B', 'foo', 'bar'])
+        tm.assert_index_equal(renamed.columns,
+                              pd.Index(['A', 'B', 'foo', 'bar']))
 
         # other axis
         renamed = self.frame.T.rename(index={'C': 'foo', 'D': 'bar'})
-        self.assert_numpy_array_equal(renamed.index, ['A', 'B', 'foo', 'bar'])
+        tm.assert_index_equal(renamed.index,
+                              pd.Index(['A', 'B', 'foo', 'bar']))
 
         # index with name
         index = Index(['foo', 'bar'], name='name')
         renamer = DataFrame(data, index=index)
         renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'})
-        self.assert_numpy_array_equal(renamed.index, ['bar', 'foo'])
+        tm.assert_index_equal(renamed.index,
+                              pd.Index(['bar', 'foo'], name='name'))
         self.assertEqual(renamed.index.name, renamer.index.name)
 
     # MultiIndex
@@ -363,12 +365,14 @@ def test_rename(self):
         renamer = DataFrame([(0, 0), (1, 1)], index=index, columns=columns)
         renamed = renamer.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
                                  columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'})
-        new_index = MultiIndex.from_tuples(
-            [('foo3', 'bar1'), ('foo2', 'bar3')])
-        new_columns = MultiIndex.from_tuples(
-            [('fizz3', 'buzz1'), ('fizz2', 'buzz3')])
-        self.assert_numpy_array_equal(renamed.index, new_index)
-        self.assert_numpy_array_equal(renamed.columns, new_columns)
+        new_index = MultiIndex.from_tuples([('foo3', 'bar1'),
+                                            ('foo2', 'bar3')],
+                                           names=['foo', 'bar'])
+        new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'),
+                                              ('fizz2', 'buzz3')],
+                                             names=['fizz', 'buzz'])
+        self.assert_index_equal(renamed.index, new_index)
+        self.assert_index_equal(renamed.columns, new_columns)
         self.assertEqual(renamed.index.names, renamer.index.names)
         self.assertEqual(renamed.columns.names, renamer.columns.names)
 
@@ -460,28 +464,30 @@ def test_reset_index(self):
         stacked.index.names = [None, None]
         deleveled2 = stacked.reset_index()
-        self.assert_numpy_array_equal(deleveled['first'],
-                                      deleveled2['level_0'])
-        self.assert_numpy_array_equal(deleveled['second'],
-                                      deleveled2['level_1'])
+        tm.assert_series_equal(deleveled['first'], deleveled2['level_0'],
+                               check_names=False)
+        tm.assert_series_equal(deleveled['second'], deleveled2['level_1'],
+                               check_names=False)
 
         # default name assigned
         rdf = self.frame.reset_index()
-        self.assert_numpy_array_equal(rdf['index'], self.frame.index.values)
+        exp = pd.Series(self.frame.index.values, name='index')
+        self.assert_series_equal(rdf['index'], exp)
 
         # default name assigned, corner case
         df = self.frame.copy()
         df['index'] = 'foo'
         rdf = df.reset_index()
-        self.assert_numpy_array_equal(rdf['level_0'], self.frame.index.values)
+        exp = pd.Series(self.frame.index.values, name='level_0')
+        self.assert_series_equal(rdf['level_0'], exp)
 
         # but this is ok
         self.frame.index.name = 'index'
         deleveled = self.frame.reset_index()
-        self.assert_numpy_array_equal(deleveled['index'],
-                                      self.frame.index.values)
-        self.assert_numpy_array_equal(deleveled.index,
-                                      np.arange(len(deleveled)))
+        self.assert_series_equal(deleveled['index'],
+                                 pd.Series(self.frame.index))
+        self.assert_index_equal(deleveled.index,
+                                pd.Index(np.arange(len(deleveled))))
 
         # preserve column names
         self.frame.columns.name = 'columns'
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 20aaae586f14f..b71235a8f6576 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -18,12 +18,6 @@ import pandas.core.nanops as nanops
 import pandas.formats.printing as printing
 
-from pandas.util.testing import (assert_almost_equal,
-                                 assert_equal,
-                                 assert_series_equal,
-                                 assert_frame_equal,
-                                 assertRaisesRegexp)
-
 import pandas.util.testing as tm
 from pandas.tests.frame.common import TestData
 
@@ -60,12 +54,12 @@ def _check_method(self, method='pearson', check_minp=False):
         if not check_minp:
             correls = self.frame.corr(method=method)
             exp = self.frame['A'].corr(self.frame['C'], method=method)
-            assert_almost_equal(correls['A']['C'], exp)
+            tm.assert_almost_equal(correls['A']['C'], exp)
         else:
             result = self.frame.corr(min_periods=len(self.frame) - 8)
             expected = self.frame.corr()
             expected.ix['A', 'B'] = expected.ix['B', 'A'] = nan
-            assert_frame_equal(result, expected)
+            tm.assert_frame_equal(result, expected)
 
     def test_corr_non_numeric(self):
         tm._skip_if_no_scipy()
@@ -75,7 +69,7 @@ def test_corr_non_numeric(self):
         # exclude non-numeric types
         result = self.mixed_frame.corr()
         expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].corr()
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_corr_nooverlap(self):
         tm._skip_if_no_scipy()
@@ -123,14 +117,14 @@ def test_corr_int_and_boolean(self):
         expected = DataFrame(np.ones((2, 2)), index=[
                              'a', 'b'], columns=['a', 'b'])
         for meth in ['pearson', 'kendall', 'spearman']:
-            assert_frame_equal(df.corr(meth), expected)
+            tm.assert_frame_equal(df.corr(meth), expected)
 
     def test_cov(self):
         # min_periods no NAs (corner case)
         expected = self.frame.cov()
         result = self.frame.cov(min_periods=len(self.frame))
 
-        assert_frame_equal(expected, result)
+        tm.assert_frame_equal(expected, result)
 
         result = self.frame.cov(min_periods=len(self.frame) + 1)
         self.assertTrue(isnull(result.values).all())
@@ -149,25 +143,25 @@ def test_cov(self):
         self.frame['B'][:10] = nan
         cov = self.frame.cov()
 
-        assert_almost_equal(cov['A']['C'],
-                            self.frame['A'].cov(self.frame['C']))
+        tm.assert_almost_equal(cov['A']['C'],
+                               self.frame['A'].cov(self.frame['C']))
 
         # exclude non-numeric types
         result = self.mixed_frame.cov()
         expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].cov()
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # Single column frame
         df = DataFrame(np.linspace(0.0, 1.0, 10))
         result = df.cov()
         expected = DataFrame(np.cov(df.values.T).reshape((1, 1)),
                              index=df.columns, columns=df.columns)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df.ix[0] = np.nan
         result = df.cov()
         expected = DataFrame(np.cov(df.values[1:].T).reshape((1, 1)),
                              index=df.columns, columns=df.columns)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_corrwith(self):
         a = self.tsframe
@@ -180,13 +174,13 @@ def test_corrwith(self):
         del b['B']
 
         colcorr = a.corrwith(b, axis=0)
-        assert_almost_equal(colcorr['A'], a['A'].corr(b['A']))
+        tm.assert_almost_equal(colcorr['A'], a['A'].corr(b['A']))
 
         rowcorr = a.corrwith(b, axis=1)
-        assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))
+        tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))
 
         dropped = a.corrwith(b, axis=0, drop=True)
-        assert_almost_equal(dropped['A'], a['A'].corr(b['A']))
+        tm.assert_almost_equal(dropped['A'], a['A'].corr(b['A']))
         self.assertNotIn('B', dropped)
 
         dropped = a.corrwith(b, axis=1, drop=True)
@@ -199,7 +193,7 @@ def test_corrwith(self):
         df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns)
         correls = df1.corrwith(df2, axis=1)
         for row in index[:4]:
-            assert_almost_equal(correls[row], df1.ix[row].corr(df2.ix[row]))
+            tm.assert_almost_equal(correls[row], df1.ix[row].corr(df2.ix[row]))
 
     def test_corrwith_with_objects(self):
         df1 = tm.makeTimeDataFrame()
@@ -211,17 +205,17 @@ def test_corrwith_with_objects(self):
 
         result = df1.corrwith(df2)
         expected = df1.ix[:, cols].corrwith(df2.ix[:, cols])
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         result = df1.corrwith(df2, axis=1)
         expected = df1.ix[:, cols].corrwith(df2.ix[:, cols], axis=1)
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
     def test_corrwith_series(self):
         result = self.tsframe.corrwith(self.tsframe['A'])
         expected = self.tsframe.apply(self.tsframe['A'].corr)
 
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
     def test_corrwith_matches_corrcoef(self):
         df1 = DataFrame(np.arange(10000), columns=['a'])
@@ -229,7 +223,7 @@ def test_corrwith_matches_corrcoef(self):
         c1 = df1.corrwith(df2)['a']
         c2 = np.corrcoef(df1['a'], df2['a'])[0][1]
 
-        assert_almost_equal(c1, c2)
+        tm.assert_almost_equal(c1, c2)
         self.assertTrue(c1 < 1)
 
     def test_bool_describe_in_mixed_frame(self):
@@ -246,14 +240,14 @@ def test_bool_describe_in_mixed_frame(self):
                                       10, 20, 30, 40, 50]},
                              index=['count', 'mean', 'std', 'min', '25%',
                                     '50%', '75%', 'max'])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # Top value is a boolean value that is False
         result = df.describe(include=['bool'])
 
         expected = DataFrame({'bool_data': [5, 2, False, 3]},
                              index=['count', 'unique', 'top', 'freq'])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_describe_categorical_columns(self):
         # GH 11558
@@ -310,8 +304,9 @@ def test_reduce_mixed_frame(self):
         })
         df.reindex(columns=['bool_data', 'int_data', 'string_data'])
         test = df.sum(axis=0)
-        assert_almost_equal(test.values, [2, 150, 'abcde'])
-        assert_series_equal(test, df.T.sum(axis=1))
+        tm.assert_numpy_array_equal(test.values,
+                                    np.array([2, 150, 'abcde'], dtype=object))
+        tm.assert_series_equal(test, df.T.sum(axis=1))
 
     def test_count(self):
         f = lambda s: notnull(s).sum()
@@ -333,17 +328,17 @@ def test_count(self):
         df = DataFrame(index=lrange(10))
         result = df.count(1)
         expected = Series(0, index=df.index)
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         df = DataFrame(columns=lrange(10))
         result = df.count(0)
         expected = Series(0, index=df.columns)
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         df = DataFrame()
         result = df.count()
         expected = Series(0, index=[])
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
     def test_sum(self):
         self._check_stat_op('sum', np.sum, has_numeric_only=True)
@@ -377,7 +372,7 @@ def test_stat_operators_attempt_obj_array(self):
                 expected = getattr(df.astype('f8'), meth)(1)
 
                 if not tm._incompat_bottleneck_version(meth):
-                    assert_series_equal(result, expected)
+                    tm.assert_series_equal(result, expected)
 
     def test_mean(self):
         self._check_stat_op('mean', np.mean, check_dates=True)
@@ -405,12 +400,12 @@ def test_cummin(self):
         # axis = 0
         cummin = self.tsframe.cummin()
         expected = self.tsframe.apply(Series.cummin)
-        assert_frame_equal(cummin, expected)
+        tm.assert_frame_equal(cummin, expected)
 
         # axis = 1
         cummin = self.tsframe.cummin(axis=1)
         expected = self.tsframe.apply(Series.cummin, axis=1)
-        assert_frame_equal(cummin, expected)
+        tm.assert_frame_equal(cummin, expected)
 
         # it works
         df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
@@ -428,12 +423,12 @@ def test_cummax(self):
         # axis = 0
         cummax = self.tsframe.cummax()
         expected = self.tsframe.apply(Series.cummax)
-        assert_frame_equal(cummax, expected)
+        tm.assert_frame_equal(cummax, expected)
 
         # axis = 1
         cummax = self.tsframe.cummax(axis=1)
         expected = self.tsframe.apply(Series.cummax, axis=1)
-        assert_frame_equal(cummax, expected)
+        tm.assert_frame_equal(cummax, expected)
 
         # it works
         df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
@@ -460,11 +455,11 @@ def test_var_std(self):
         result = self.tsframe.std(ddof=4)
         expected = self.tsframe.apply(lambda x: x.std(ddof=4))
-        assert_almost_equal(result, expected)
+        tm.assert_almost_equal(result, expected)
 
         result = self.tsframe.var(ddof=4)
         expected = self.tsframe.apply(lambda x: x.var(ddof=4))
-        assert_almost_equal(result, expected)
+        tm.assert_almost_equal(result, expected)
 
         arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
         result = nanops.nanvar(arr, axis=0)
@@ -489,11 +484,11 @@ def test_numeric_only_flag(self):
         for meth in methods:
             result = getattr(df1, meth)(axis=1, numeric_only=True)
             expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
-            assert_series_equal(expected, result)
+            tm.assert_series_equal(expected, result)
 
             result = getattr(df2, meth)(axis=1, numeric_only=True)
             expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
-            assert_series_equal(expected, result)
+            tm.assert_series_equal(expected, result)
 
             # df1 has all numbers, df2 has a letter inside
             self.assertRaises(TypeError, lambda: getattr(df1, meth)
@@ -509,12 +504,12 @@ def test_cumsum(self):
         # axis = 0
         cumsum = self.tsframe.cumsum()
         expected = self.tsframe.apply(Series.cumsum)
-        assert_frame_equal(cumsum, expected)
+        tm.assert_frame_equal(cumsum, expected)
 
         # axis = 1
         cumsum = self.tsframe.cumsum(axis=1)
         expected = self.tsframe.apply(Series.cumsum, axis=1)
-        assert_frame_equal(cumsum, expected)
+        tm.assert_frame_equal(cumsum, expected)
 
         # works
         df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
@@ -532,12 +527,12 @@ def test_cumprod(self):
         # axis = 0
         cumprod = self.tsframe.cumprod()
         expected = self.tsframe.apply(Series.cumprod)
-        assert_frame_equal(cumprod, expected)
+        tm.assert_frame_equal(cumprod, expected)
 
         # axis = 1
         cumprod = self.tsframe.cumprod(axis=1)
         expected = self.tsframe.apply(Series.cumprod, axis=1)
-        assert_frame_equal(cumprod, expected)
+        tm.assert_frame_equal(cumprod, expected)
 
         # fix issue
         cumprod_xs = self.tsframe.cumprod(axis=1)
@@ -574,48 +569,48 @@ def test_rank(self):
         exp1 = np.apply_along_axis(rankdata, 1, fvals)
         exp1[mask] = np.nan
 
-        assert_almost_equal(ranks0.values, exp0)
-        assert_almost_equal(ranks1.values, exp1)
+        tm.assert_almost_equal(ranks0.values, exp0)
+        tm.assert_almost_equal(ranks1.values, exp1)
 
         # integers
         df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
 
         result = df.rank()
         exp = df.astype(float).rank()
-        assert_frame_equal(result, exp)
+        tm.assert_frame_equal(result, exp)
 
         result = df.rank(1)
         exp = df.astype(float).rank(1)
-        assert_frame_equal(result, exp)
+        tm.assert_frame_equal(result, exp)
 
     def test_rank2(self):
         df = DataFrame([[1, 3, 2], [1, 2, 3]])
         expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
         result = df.rank(1, pct=True)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df = DataFrame([[1, 3, 2], [1, 2, 3]])
         expected = df.rank(0) / 2.0
         result = df.rank(0, pct=True)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
         expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
         result = df.rank(1, numeric_only=False)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
         result = df.rank(0, numeric_only=False)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
         expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]])
         result = df.rank(1, numeric_only=False)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]])
         result = df.rank(0, numeric_only=False)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # f7u12, this does not work without extensive workaround
         data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
@@ -627,12 +622,12 @@ def test_rank2(self):
 
         expected = DataFrame([[2., nan, 1.],
                               [2., 3., 1.]])
         result = df.rank(1, numeric_only=False, ascending=True)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         expected = DataFrame([[1., nan, 2.],
                               [2., 1., 3.]])
         result = df.rank(1, numeric_only=False, ascending=False)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # mixed-type frames
         self.mixed_frame['datetime'] = datetime.now()
@@ -640,12 +635,12 @@ def test_rank2(self):
 
         result = self.mixed_frame.rank(1)
         expected = self.mixed_frame.rank(1, numeric_only=True)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
                               1e60, 1e80, 1e-30]})
         exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
-        assert_frame_equal(df.rank(), exp)
+        tm.assert_frame_equal(df.rank(), exp)
 
     def test_rank_na_option(self):
         tm._skip_if_no_scipy()
@@ -665,8 +660,8 @@ def test_rank_na_option(self):
         exp0 = np.apply_along_axis(rankdata, 0, fvals)
         exp1 = np.apply_along_axis(rankdata, 1, fvals)
 
-        assert_almost_equal(ranks0.values, exp0)
-        assert_almost_equal(ranks1.values, exp1)
+        tm.assert_almost_equal(ranks0.values, exp0)
+        tm.assert_almost_equal(ranks1.values, exp1)
 
         # top
         ranks0 = self.frame.rank(na_option='top')
@@ -680,8 +675,8 @@ def test_rank_na_option(self):
         exp0 = np.apply_along_axis(rankdata, 0, fval0)
         exp1 = np.apply_along_axis(rankdata, 1, fval1)
 
-        assert_almost_equal(ranks0.values, exp0)
-        assert_almost_equal(ranks1.values, exp1)
+        tm.assert_almost_equal(ranks0.values, exp0)
+        tm.assert_almost_equal(ranks1.values, exp1)
 
         # descending
 
@@ -694,8 +689,8 @@ def test_rank_na_option(self):
         exp0 = np.apply_along_axis(rankdata, 0, -fvals)
         exp1 = np.apply_along_axis(rankdata, 1, -fvals)
 
-        assert_almost_equal(ranks0.values, exp0)
-        assert_almost_equal(ranks1.values, exp1)
+        tm.assert_almost_equal(ranks0.values, exp0)
+        tm.assert_almost_equal(ranks1.values, exp1)
 
         # descending
 
@@ -711,14 +706,14 @@ def test_rank_na_option(self):
         exp0 = np.apply_along_axis(rankdata, 0, -fval0)
         exp1 = np.apply_along_axis(rankdata, 1, -fval1)
 
-        assert_almost_equal(ranks0.values, exp0)
-        assert_almost_equal(ranks1.values, exp1)
+        tm.assert_numpy_array_equal(ranks0.values, exp0)
+        tm.assert_numpy_array_equal(ranks1.values, exp1)
 
     def test_rank_axis(self):
         # check if using axes' names gives the same result
         df = pd.DataFrame([[2, 1], [4, 3]])
-        assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
-        assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))
+        tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
+        tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))
 
     def test_sem(self):
         alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
@@ -727,7 +722,7 @@ def test_sem(self):
         result = self.tsframe.sem(ddof=4)
         expected = self.tsframe.apply(
             lambda x: x.std(ddof=4) / np.sqrt(len(x)))
-        assert_almost_equal(result, expected)
+        tm.assert_almost_equal(result, expected)
 
         arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
         result = nanops.nansem(arr, axis=0)
@@ -789,7 +784,7 @@ def alt(x):
 
         kurt = df.kurt()
         kurt2 = df.kurt(level=0).xs('bar')
-        assert_series_equal(kurt, kurt2, check_names=False)
+        tm.assert_series_equal(kurt, kurt2, check_names=False)
         self.assertTrue(kurt.name is None)
         self.assertEqual(kurt2.name, 'bar')
 
@@ -827,26 +822,26 @@ def wrapper(x):
             result0 = f(axis=0, skipna=False)
             result1 = f(axis=1, skipna=False)
-            assert_series_equal(result0, frame.apply(wrapper),
-                                check_dtype=check_dtype,
-                                check_less_precise=check_less_precise)
+            tm.assert_series_equal(result0, frame.apply(wrapper),
+                                   check_dtype=check_dtype,
+                                   check_less_precise=check_less_precise)
             # HACK: win32
-            assert_series_equal(result1, frame.apply(wrapper, axis=1),
-                                check_dtype=False,
-                                check_less_precise=check_less_precise)
+            tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
+                                   check_dtype=False,
+                                   check_less_precise=check_less_precise)
         else:
             skipna_wrapper = alternative
             wrapper = alternative
 
         result0 = f(axis=0)
         result1 = f(axis=1)
-        assert_series_equal(result0, frame.apply(skipna_wrapper),
-                            check_dtype=check_dtype,
-                            check_less_precise=check_less_precise)
+        tm.assert_series_equal(result0, frame.apply(skipna_wrapper),
+                               check_dtype=check_dtype,
+                               check_less_precise=check_less_precise)
         if not tm._incompat_bottleneck_version(name):
-            assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
-                                check_dtype=False,
-                                check_less_precise=check_less_precise)
+            exp = frame.apply(skipna_wrapper, axis=1)
+            tm.assert_series_equal(result1, exp, check_dtype=False,
+                                   check_less_precise=check_less_precise)
 
         # check dtypes
         if check_dtype:
@@ -859,7 +854,7 @@ def wrapper(x):
         # assert_series_equal(result, comp)
 
         # bad axis
-        assertRaisesRegexp(ValueError, 'No axis named 2', f, axis=2)
+        tm.assertRaisesRegexp(ValueError, 'No axis named 2', f, axis=2)
         # make sure works on mixed-type frame
         getattr(self.mixed_frame, name)(axis=0)
         getattr(self.mixed_frame, name)(axis=1)
@@ -885,20 +880,20 @@ def test_mode(self):
                            "C": [8, 8, 8, 9, 9, 9],
                            "D": np.arange(6, dtype='int64'),
                            "E": [8, 8, 1, 1, 3, 3]})
-        assert_frame_equal(df[["A"]].mode(),
-                           pd.DataFrame({"A": [12]}))
+        tm.assert_frame_equal(df[["A"]].mode(),
+                              pd.DataFrame({"A": [12]}))
         expected = pd.Series([], dtype='int64', name='D').to_frame()
-        assert_frame_equal(df[["D"]].mode(), expected)
+        tm.assert_frame_equal(df[["D"]].mode(), expected)
         expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame()
-        assert_frame_equal(df[["E"]].mode(), expected)
-        assert_frame_equal(df[["A", "B"]].mode(),
-                           pd.DataFrame({"A": [12], "B": [10.]}))
-        assert_frame_equal(df.mode(),
-                           pd.DataFrame({"A": [12, np.nan, np.nan],
-                                         "B": [10, np.nan, np.nan],
-                                         "C": [8, 9, np.nan],
-                                         "D": [np.nan, np.nan, np.nan],
-                                         "E": [1, 3, 8]}))
+        tm.assert_frame_equal(df[["E"]].mode(), expected)
+        tm.assert_frame_equal(df[["A", "B"]].mode(),
+                              pd.DataFrame({"A": [12], "B": [10.]}))
+        tm.assert_frame_equal(df.mode(),
+                              pd.DataFrame({"A": [12, np.nan, np.nan],
+                                            "B": [10, np.nan, np.nan],
+                                            "C": [8, 9, np.nan],
+                                            "D": [np.nan, np.nan, np.nan],
+                                            "E": [1, 3, 8]}))
 
         # outputs in sorted order
         df["C"] = list(reversed(df["C"]))
@@ -910,7 +905,7 @@ def test_mode(self):
                                        "C": [8, 9]}))
         printing.pprint_thing(a)
         printing.pprint_thing(b)
-        assert_frame_equal(a, b)
+        tm.assert_frame_equal(a, b)
 
         # should work with heterogeneous types
         df = pd.DataFrame({"A": np.arange(6, dtype='int64'),
                            "B": pd.date_range('2011', periods=6),
@@ -918,7 +913,7 @@ def test_mode(self):
         exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype),
                             "B": pd.Series([], dtype=df["B"].dtype),
                             "C": pd.Series([], dtype=df["C"].dtype)})
-        assert_frame_equal(df.mode(), exp)
+        tm.assert_frame_equal(df.mode(), exp)
 
         # and also when not empty
         df.loc[1, "A"] = 0
@@ -929,7 +924,7 @@ def test_mode(self):
                                           dtype=df["B"].dtype),
                             "C": pd.Series(['e'],
                                            dtype=df["C"].dtype)})
-        assert_frame_equal(df.mode(), exp)
+        tm.assert_frame_equal(df.mode(), exp)
 
     def test_operators_timedelta64(self):
         from datetime import timedelta
@@ -962,8 +957,8 @@ def test_operators_timedelta64(self):
         result2 = abs(diffs)
         expected = DataFrame(dict(A=df['A'] - df['C'],
                                   B=df['B'] - df['A']))
-        assert_frame_equal(result, expected)
-        assert_frame_equal(result2, expected)
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result2, expected)
 
         # mixed frame
         mixed = diffs.copy()
@@ -982,22 +977,22 @@ def test_operators_timedelta64(self):
                            'foo', 1, 1.0, Timestamp('20130101')],
                           index=mixed.columns)
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         # excludes numeric
         result = mixed.min(axis=1)
         expected = Series([1, 1, 1.], index=[0, 1, 2])
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         # works when only those columns are selected
         result = mixed[['A', 'B']].min(1)
         expected = Series([timedelta(days=-1)] * 3)
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         result = mixed[['A', 'B']].min()
         expected = Series([timedelta(seconds=5 * 60 + 5),
                            timedelta(days=-1)], index=['A', 'B'])
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         # GH 3106
         df = DataFrame({'time': date_range('20130102', periods=5),
@@ -1035,13 +1030,13 @@ def test_mean_corner(self):
         # unit test when have object data
         the_mean = self.mixed_frame.mean(axis=0)
         the_sum = self.mixed_frame.sum(axis=0, numeric_only=True)
-        self.assertTrue(the_sum.index.equals(the_mean.index))
+        self.assert_index_equal(the_sum.index, the_mean.index)
         self.assertTrue(len(the_mean.index) < len(self.mixed_frame.columns))
 
         # xs sum mixed type, just want to know it works...
         the_mean = self.mixed_frame.mean(axis=1)
         the_sum = self.mixed_frame.sum(axis=1, numeric_only=True)
-        self.assertTrue(the_sum.index.equals(the_mean.index))
+        self.assert_index_equal(the_sum.index, the_mean.index)
 
         # take mean of boolean column
         self.frame['bool'] = self.frame['A'] > 0
@@ -1070,8 +1065,8 @@ def test_count_objects(self):
         dm = DataFrame(self.mixed_frame._series)
         df = DataFrame(self.mixed_frame._series)
 
-        assert_series_equal(dm.count(), df.count())
-        assert_series_equal(dm.count(1), df.count(1))
+        tm.assert_series_equal(dm.count(), df.count())
+        tm.assert_series_equal(dm.count(1), df.count(1))
 
     def test_cumsum_corner(self):
         dm = DataFrame(np.arange(20).reshape(4, 5),
@@ -1094,9 +1089,9 @@ def test_idxmin(self):
             for axis in [0, 1]:
                 for df in [frame, self.intframe]:
                     result = df.idxmin(axis=axis, skipna=skipna)
-                    expected = df.apply(
-                        Series.idxmin, axis=axis, skipna=skipna)
-                    assert_series_equal(result, expected)
+                    expected = df.apply(Series.idxmin, axis=axis,
+                                        skipna=skipna)
+                    tm.assert_series_equal(result, expected)
 
         self.assertRaises(ValueError, frame.idxmin, axis=2)
 
@@ -1108,9 +1103,9 @@ def test_idxmax(self):
             for axis in [0, 1]:
                 for df in [frame, self.intframe]:
                     result = df.idxmax(axis=axis, skipna=skipna)
-                    expected = df.apply(
-                        Series.idxmax, axis=axis, skipna=skipna)
-                    assert_series_equal(result, expected)
+                    expected = df.apply(Series.idxmax, axis=axis,
+                                        skipna=skipna)
+                    tm.assert_series_equal(result, expected)
 
         self.assertRaises(ValueError, frame.idxmax, axis=2)
 
@@ -1169,18 +1164,18 @@ def wrapper(x):
             result0 = f(axis=0, skipna=False)
             result1 = f(axis=1, skipna=False)
-            assert_series_equal(result0, frame.apply(wrapper))
-            assert_series_equal(result1, frame.apply(wrapper, axis=1),
-                                check_dtype=False)  # HACK: win32
+            tm.assert_series_equal(result0, frame.apply(wrapper))
+            tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
+                                   check_dtype=False)  # HACK: win32
         else:
             skipna_wrapper = alternative
             wrapper = alternative
 
         result0 = f(axis=0)
         result1 = f(axis=1)
-        assert_series_equal(result0, frame.apply(skipna_wrapper))
-        assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
-                            check_dtype=False)
+        tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
+        tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
+                               check_dtype=False)
 
         # result = f(axis=1)
         # comp = frame.apply(alternative, axis=1).reindex(result.index)
@@ -1230,7 +1225,7 @@ def test_nlargest(self):
                         'b': list(ascii_lowercase[:10])})
         result = df.nlargest(5, 'a')
         expected = df.sort_values('a', ascending=False).head(5)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_nlargest_multiple_columns(self):
         from string import ascii_lowercase
@@ -1239,7 +1234,7 @@ def test_nlargest_multiple_columns(self):
                         'c': np.random.permutation(10).astype('float64')})
         result = df.nlargest(5, ['a', 'b'])
         expected = df.sort_values(['a', 'b'], ascending=False).head(5)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_nsmallest(self):
         from string import ascii_lowercase
@@ -1247,7 +1242,7 @@ def test_nsmallest(self):
                         'b': list(ascii_lowercase[:10])})
         result = df.nsmallest(5, 'a')
         expected = df.sort_values('a').head(5)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_nsmallest_multiple_columns(self):
        from string import ascii_lowercase
@@ -1256,7 +1251,7 @@ def test_nsmallest_multiple_columns(self):
                         'c': np.random.permutation(10).astype('float64')})
        result = df.nsmallest(5, ['a', 'c'])
        expected = df.sort_values(['a', 'c']).head(5)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     # ----------------------------------------------------------------------
     # Isin
 
@@ -1270,13 +1265,13 @@ def test_isin(self):
         result = df.isin(other)
         expected = DataFrame([df.loc[s].isin(other) for s in df.index])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_isin_empty(self):
         df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
         result = df.isin([])
         expected = pd.DataFrame(False, df.index, df.columns)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_isin_dict(self):
         df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
@@ -1286,7 +1281,7 @@ def test_isin_dict(self):
         expected.loc[0, 'A'] = True
 
         result = df.isin(d)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # non unique columns
         df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
@@ -1294,7 +1289,7 @@ def test_isin_dict(self):
         expected = DataFrame(False, df.index, df.columns)
         expected.loc[0, 'A'] = True
         result = df.isin(d)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_isin_with_string_scalar(self):
         # GH4763
@@ -1314,13 +1309,13 @@ def test_isin_df(self):
         result = df1.isin(df2)
         expected['A'].loc[[1, 3]] = True
         expected['B'].loc[[0, 2]] = True
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # partial overlapping columns
         df2.columns = ['A', 'C']
         result = df1.isin(df2)
         expected['B'] = False
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_isin_df_dupe_values(self):
         df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
@@ -1348,7 +1343,7 @@ def test_isin_dupe_self(self):
         expected = DataFrame(False, index=df.index, columns=df.columns)
         expected.loc[0] = True
         expected.iloc[1, 1] = True
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_isin_against_series(self):
         df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]},
@@ -1358,7 +1353,7 @@ def test_isin_against_series(self):
         expected['A'].loc['a'] = True
         expected.loc['d'] = True
         result = df.isin(s)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_isin_multiIndex(self):
         idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'),
@@ -1374,7 +1369,7 @@ def test_isin_multiIndex(self):
         # against regular index
         expected = DataFrame(False, index=df1.index, columns=df1.columns)
         result = df1.isin(df2)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df2.index = idx
         expected = df2.values.astype(np.bool)
@@ -1382,7 +1377,7 @@ def test_isin_multiIndex(self):
         expected = DataFrame(expected, columns=['A', 'B'], index=idx)
         result = df1.isin(df2)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     # ----------------------------------------------------------------------
     # Row deduplication
 
@@ -1398,43 +1393,43 @@ def test_drop_duplicates(self):
         # single column
         result = df.drop_duplicates('AAA')
         expected = df[:2]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('AAA', keep='last')
         expected = df.ix[[6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('AAA', keep=False)
         expected = df.ix[[]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
         self.assertEqual(len(result), 0)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates('AAA', take_last=True)
         expected = df.ix[[6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # multi column
         expected = df.ix[[0, 1, 2, 3]]
         result = df.drop_duplicates(np.array(['AAA', 'B']))
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
         result = df.drop_duplicates(['AAA', 'B'])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(('AAA', 'B'), keep='last')
         expected = df.ix[[0, 5, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(('AAA', 'B'), keep=False)
         expected = df.ix[[0]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates(('AAA', 'B'), take_last=True)
         expected = df.ix[[0, 5, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # consider everything
         df2 = df.ix[:, ['AAA', 'B', 'C']]
@@ -1442,64 +1437,64 @@ def test_drop_duplicates(self):
         result = df2.drop_duplicates()
         # in this case only
         expected = df2.drop_duplicates(['AAA', 'B'])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df2.drop_duplicates(keep='last')
         expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df2.drop_duplicates(keep=False)
         expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df2.drop_duplicates(take_last=True)
         with tm.assert_produces_warning(FutureWarning):
             expected = df2.drop_duplicates(['AAA', 'B'], take_last=True)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # integers
         result = df.drop_duplicates('C')
         expected = df.iloc[[0, 2]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
         result = df.drop_duplicates('C', keep='last')
         expected = df.iloc[[-2, -1]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df['E'] = df['C'].astype('int8')
         result = df.drop_duplicates('E')
         expected = df.iloc[[0, 2]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
         result = df.drop_duplicates('E', keep='last')
         expected = df.iloc[[-2, -1]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # GH 11376
         df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
                            'y': [0, 6, 5, 5, 9, 1, 2]})
         expected = df.loc[df.index != 3]
-        assert_frame_equal(df.drop_duplicates(), expected)
+        tm.assert_frame_equal(df.drop_duplicates(), expected)
 
         df = pd.DataFrame([[1, 0], [0, 2]])
-        assert_frame_equal(df.drop_duplicates(), df)
+        tm.assert_frame_equal(df.drop_duplicates(), df)
 
         df = pd.DataFrame([[-2, 0], [0, -4]])
-        assert_frame_equal(df.drop_duplicates(), df)
+        tm.assert_frame_equal(df.drop_duplicates(), df)
 
         x = np.iinfo(np.int64).max / 3 * 2
         df = pd.DataFrame([[-x, x], [0, x + 4]])
-        assert_frame_equal(df.drop_duplicates(), df)
+        tm.assert_frame_equal(df.drop_duplicates(), df)
 
         df = pd.DataFrame([[-x, x], [x, x + 4]])
-        assert_frame_equal(df.drop_duplicates(), df)
+        tm.assert_frame_equal(df.drop_duplicates(), df)
 
         # GH 11864
         df = pd.DataFrame([i] * 9 for i in range(16))
         df = df.append([[1] + [0] * 8], ignore_index=True)
 
         for keep in ['first', 'last', False]:
-            assert_equal(df.duplicated(keep=keep).sum(), 0)
+            self.assertEqual(df.duplicated(keep=keep).sum(), 0)
 
     def test_drop_duplicates_for_take_all(self):
         df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
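Note on the keep semantics these drop_duplicates assertions pin down:
keep='first' (the default) and keep='last' retain one row per duplicate group,
keep=False drops every duplicated row, and take_last=True is the deprecated
spelling of keep='last' (hence the FutureWarning checks). A small sketch:

    import pandas as pd

    df = pd.DataFrame({'AAA': ['foo', 'foo', 'bar']})
    df.drop_duplicates('AAA')               # index [0, 2]: first 'foo' kept
    df.drop_duplicates('AAA', keep='last')  # index [1, 2]: last 'foo' kept
    df.drop_duplicates('AAA', keep=False)   # index [2]: both 'foo' rows dropped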
@@ -1512,28 +1507,28 @@ def test_drop_duplicates_for_take_all(self):
         # single column
         result = df.drop_duplicates('AAA')
         expected = df.iloc[[0, 1, 2, 6]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('AAA', keep='last')
         expected = df.iloc[[2, 5, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('AAA', keep=False)
         expected = df.iloc[[2, 6]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # multiple columns
         result = df.drop_duplicates(['AAA', 'B'])
         expected = df.iloc[[0, 1, 2, 3, 4, 6]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['AAA', 'B'], keep='last')
         expected = df.iloc[[0, 1, 2, 5, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['AAA', 'B'], keep=False)
         expected = df.iloc[[0, 1, 2, 6]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_drop_duplicates_tuple(self):
         df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar',
@@ -1546,27 +1541,27 @@ def test_drop_duplicates_tuple(self):
         # single column
         result = df.drop_duplicates(('AA', 'AB'))
         expected = df[:2]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(('AA', 'AB'), keep='last')
         expected = df.ix[[6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(('AA', 'AB'), keep=False)
         expected = df.ix[[]]  # empty df
         self.assertEqual(len(result), 0)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates(('AA', 'AB'), take_last=True)
         expected = df.ix[[6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # multi column
         expected = df.ix[[0, 1, 2, 3]]
         result = df.drop_duplicates((('AA', 'AB'), 'B'))
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_drop_duplicates_NA(self):
         # none
@@ -1580,41 +1575,41 @@ def test_drop_duplicates_NA(self):
         # single column
         result = df.drop_duplicates('A')
         expected = df.ix[[0, 2, 3]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('A', keep='last')
         expected = df.ix[[1, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('A', keep=False)
         expected = df.ix[[]]  # empty df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
         self.assertEqual(len(result), 0)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates('A', take_last=True)
         expected = df.ix[[1, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # multi column
         result = df.drop_duplicates(['A', 'B'])
         expected = df.ix[[0, 2, 3, 6]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['A', 'B'], keep='last')
         expected = df.ix[[1, 5, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['A', 'B'], keep=False)
         expected = df.ix[[6]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates(['A', 'B'], take_last=True)
         expected = df.ix[[1, 5, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # nan
         df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
@@ -1627,41 +1622,41 @@ def test_drop_duplicates_NA(self):
         # single column
         result = df.drop_duplicates('C')
         expected = df[:2]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('C', keep='last')
         expected = df.ix[[3, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('C', keep=False)
         expected = df.ix[[]]  # empty df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
         self.assertEqual(len(result), 0)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates('C', take_last=True)
         expected = df.ix[[3, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # multi column
         result = df.drop_duplicates(['C', 'B'])
         expected = df.ix[[0, 1, 2, 4]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['C', 'B'], keep='last')
         expected = df.ix[[1, 3, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates(['C', 'B'], keep=False)
         expected = df.ix[[1]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         with tm.assert_produces_warning(FutureWarning):
             result = df.drop_duplicates(['C', 'B'], take_last=True)
         expected = df.ix[[1, 3, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_drop_duplicates_NA_for_take_all(self):
         # none
@@ -1672,30 +1667,30 @@ def test_drop_duplicates_NA_for_take_all(self):
         # single column
         result = df.drop_duplicates('A')
         expected = df.iloc[[0, 2, 3, 5, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('A', keep='last')
         expected = df.iloc[[1, 4, 5, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('A', keep=False)
         expected = df.iloc[[5, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # nan
 
         # single column
         result = df.drop_duplicates('C')
         expected = df.iloc[[0, 1, 5, 6]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('C', keep='last')
         expected = df.iloc[[3, 5, 6, 7]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = df.drop_duplicates('C', keep=False)
         expected = df.iloc[[5, 6]]
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_drop_duplicates_inplace(self):
         orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
@@ -1710,19 +1705,19 @@ def test_drop_duplicates_inplace(self):
         df.drop_duplicates('A', inplace=True)
         expected = orig[:2]
         result = df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df = orig.copy()
         df.drop_duplicates('A', keep='last', inplace=True)
         expected = orig.ix[[6, 7]]
         result = df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df = orig.copy()
         df.drop_duplicates('A', keep=False, inplace=True)
         expected = orig.ix[[]]
         result = df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
         self.assertEqual(len(df), 0)
 
         # deprecate take_last
@@ -1731,26 +1726,26 @@ def test_drop_duplicates_inplace(self):
             df.drop_duplicates('A', take_last=True, inplace=True)
         expected = orig.ix[[6, 7]]
         result = df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # multi column
         df = orig.copy()
         df.drop_duplicates(['A', 'B'], inplace=True)
         expected = orig.ix[[0, 1, 2, 3]]
         result = df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df = orig.copy()
         df.drop_duplicates(['A', 'B'], keep='last', inplace=True)
         expected = orig.ix[[0, 5, 6, 7]]
         result = df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df = orig.copy()
         df.drop_duplicates(['A', 'B'], keep=False, inplace=True)
         expected = orig.ix[[0]]
         result = df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         df = orig.copy()
@@ -1758,7 +1753,7 @@ def test_drop_duplicates_inplace(self):
             df.drop_duplicates(['A', 'B'], take_last=True, inplace=True)
         expected = orig.ix[[0, 5, 6, 7]]
         result = df
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # consider everything
         orig2 = orig.ix[:, ['A', 'B', 'C']].copy()
@@ -1768,19 +1763,19 @@ def test_drop_duplicates_inplace(self):
         # in this case only
         expected = orig2.drop_duplicates(['A', 'B'])
         result = df2
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df2 = orig2.copy()
         df2.drop_duplicates(keep='last', inplace=True)
         expected = orig2.drop_duplicates(['A', 'B'], keep='last')
         result = df2
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         df2 = orig2.copy()
         df2.drop_duplicates(keep=False, inplace=True)
         expected = orig2.drop_duplicates(['A', 'B'], keep=False)
         result = df2
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # deprecate take_last
         df2 = orig2.copy()
@@ -1789,7 +1784,7 @@ def test_drop_duplicates_inplace(self):
         with tm.assert_produces_warning(FutureWarning):
             expected = orig2.drop_duplicates(['A', 'B'], take_last=True)
         result = df2
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     # Rounding
 
@@ -1798,26 +1793,26 @@ def test_round(self):
         # Test that rounding an empty DataFrame does nothing
         df = DataFrame()
-        assert_frame_equal(df, df.round())
+        tm.assert_frame_equal(df, df.round())
 
         # Here's the test frame we'll be working with
-        df = DataFrame(
-            {'col1': [1.123, 2.123, 3.123], 'col2': [1.234, 2.234, 3.234]})
+        df = DataFrame({'col1': [1.123, 2.123, 3.123],
+                        'col2': [1.234, 2.234, 3.234]})
 
         # Default round to integer (i.e. decimals=0)
         expected_rounded = DataFrame(
             {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
-        assert_frame_equal(df.round(), expected_rounded)
+        tm.assert_frame_equal(df.round(), expected_rounded)
 
         # Round with an integer
         decimals = 2
-        expected_rounded = DataFrame(
-            {'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]})
-        assert_frame_equal(df.round(decimals), expected_rounded)
+        expected_rounded = DataFrame({'col1': [1.12, 2.12, 3.12],
+                                      'col2': [1.23, 2.23, 3.23]})
+        tm.assert_frame_equal(df.round(decimals), expected_rounded)
 
         # This should also work with np.round (since np.round dispatches to
         # df.round)
-        assert_frame_equal(np.round(df, decimals), expected_rounded)
+        tm.assert_frame_equal(np.round(df, decimals), expected_rounded)
 
         # Round with a list
         round_list = [1, 2]
@@ -1828,19 +1823,19 @@ def test_round(self):
         expected_rounded = DataFrame(
             {'col1': [1.1, 2.1, 3.1], 'col2': [1.23, 2.23, 3.23]})
         round_dict = {'col1': 1, 'col2': 2}
-        assert_frame_equal(df.round(round_dict), expected_rounded)
+        tm.assert_frame_equal(df.round(round_dict), expected_rounded)
 
         # Incomplete dict
         expected_partially_rounded = DataFrame(
            {'col1': [1.123, 2.123, 3.123], 'col2': [1.2, 2.2, 3.2]})
        partial_round_dict = {'col2': 1}
-        assert_frame_equal(
-            df.round(partial_round_dict), expected_partially_rounded)
+        tm.assert_frame_equal(df.round(partial_round_dict),
+                              expected_partially_rounded)
 
         # Dict with unknown elements
         wrong_round_dict = {'col3': 2, 'col2': 1}
-        assert_frame_equal(
-            df.round(wrong_round_dict), expected_partially_rounded)
+        tm.assert_frame_equal(df.round(wrong_round_dict),
+                              expected_partially_rounded)
 
         # float input to `decimals`
         non_int_round_dict = {'col1': 1, 'col2': 0.5}
@@ -1879,8 +1874,8 @@ def test_round(self):
         big_df = df * 100
         expected_neg_rounded = DataFrame(
             {'col1': [110., 210, 310], 'col2': [100., 200, 300]})
-        assert_frame_equal(
-            big_df.round(negative_round_dict), expected_neg_rounded)
+        tm.assert_frame_equal(big_df.round(negative_round_dict),
+                              expected_neg_rounded)
 
         # nan in Series round
         nan_round_Series = Series({'col1': nan, 'col2': 1})
@@ -1899,7 +1894,7 @@ def test_round(self):
             df.round(nan_round_Series)
 
         # Make sure this doesn't break existing Series.round
-        assert_series_equal(df['col1'].round(1), expected_rounded['col1'])
+        tm.assert_series_equal(df['col1'].round(1), expected_rounded['col1'])
 
         # named columns
         # GH 11986
@@ -1908,20 +1903,20 @@ def test_round(self):
             {'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]})
         df.columns.name = "cols"
         expected_rounded.columns.name = "cols"
-        assert_frame_equal(df.round(decimals), expected_rounded)
+        tm.assert_frame_equal(df.round(decimals), expected_rounded)
 
         # interaction of named columns & series
-        assert_series_equal(df['col1'].round(decimals),
-                            expected_rounded['col1'])
-        assert_series_equal(df.round(decimals)['col1'],
-                            expected_rounded['col1'])
+        tm.assert_series_equal(df['col1'].round(decimals),
+                               expected_rounded['col1'])
+        tm.assert_series_equal(df.round(decimals)['col1'],
+                               expected_rounded['col1'])
 
     def test_numpy_round(self):
         # See gh-12600
         df = DataFrame([[1.53, 1.36], [0.06, 7.01]])
         out = np.round(df, decimals=0)
         expected = DataFrame([[2., 1.], [0., 7.]])
-        assert_frame_equal(out, expected)
+        tm.assert_frame_equal(out, expected)
 
         msg = "the 'out' parameter is not supported"
         with tm.assertRaisesRegexp(ValueError, msg):
@@ -1935,12 +1930,12 @@ def test_round_mixed_type(self):
         round_0 = DataFrame({'col1': [1., 2., 3., 4.],
                              'col2': ['1', 'a', 'c', 'f'],
                              'col3': date_range('20111111', periods=4)})
-        assert_frame_equal(df.round(), round_0)
-        assert_frame_equal(df.round(1), df)
-        assert_frame_equal(df.round({'col1': 1}), df)
-        assert_frame_equal(df.round({'col1': 0}), round_0)
-        assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0)
-        assert_frame_equal(df.round({'col3': 1}), df)
+        tm.assert_frame_equal(df.round(), round_0)
+        tm.assert_frame_equal(df.round(1), df)
+        tm.assert_frame_equal(df.round({'col1': 1}), df)
+        tm.assert_frame_equal(df.round({'col1': 0}), round_0)
+        tm.assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0)
+        tm.assert_frame_equal(df.round({'col3': 1}), df)
 
     def test_round_issue(self):
         # GH11611
@@ -1950,7 +1945,7 @@ def test_round_issue(self):
         dfs = pd.concat((df, df), axis=1)
         rounded = dfs.round()
-        self.assertTrue(rounded.index.equals(dfs.index))
+        self.assert_index_equal(rounded.index, dfs.index)
 
         decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A'])
         self.assertRaises(ValueError, df.round, decimals)
@@ -1968,7 +1963,7 @@ def test_built_in_round(self):
         # Default round to integer (i.e. decimals=0)
         expected_rounded = DataFrame(
             {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
-        assert_frame_equal(round(df), expected_rounded)
+        tm.assert_frame_equal(round(df), expected_rounded)
 
     # Clip
 
@@ -2015,14 +2010,14 @@ def test_clip_against_series(self):
                 mask = ~lb_mask & ~ub_mask
 
                 result = clipped_df.loc[lb_mask, i]
-                assert_series_equal(result, lb[lb_mask], check_names=False)
+                tm.assert_series_equal(result, lb[lb_mask], check_names=False)
                 self.assertEqual(result.name, i)
 
                 result = clipped_df.loc[ub_mask, i]
-                assert_series_equal(result, ub[ub_mask], check_names=False)
+                tm.assert_series_equal(result, ub[ub_mask], check_names=False)
                 self.assertEqual(result.name, i)
 
-                assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])
+                tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])
 
     def test_clip_against_frame(self):
         df = DataFrame(np.random.randn(1000, 2))
@@ -2035,9 +2030,9 @@ def test_clip_against_frame(self):
         ub_mask = df >= ub
         mask = ~lb_mask & ~ub_mask
 
-        assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
-        assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
-        assert_frame_equal(clipped_df[mask], df[mask])
+        tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
+        tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
+        tm.assert_frame_equal(clipped_df[mask], df[mask])
 
     # Matrix-like
 
@@ -2054,15 +2049,15 @@ def test_dot(self):
         # Check alignment
         b1 = b.reindex(index=reversed(b.index))
         result = a.dot(b)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # Check series argument
         result = a.dot(b['one'])
-        assert_series_equal(result, expected['one'], check_names=False)
+        tm.assert_series_equal(result, expected['one'], check_names=False)
         self.assertTrue(result.name is None)
 
         result = a.dot(b1['one'])
-        assert_series_equal(result, expected['one'], check_names=False)
+        tm.assert_series_equal(result, expected['one'], check_names=False)
         self.assertTrue(result.name is None)
 
         # can pass correct-length arrays
@@ -2070,9 +2065,9 @@ def test_dot(self):
 
         result = a.dot(row)
         exp = a.dot(a.ix[0])
-        assert_series_equal(result, exp)
+        tm.assert_series_equal(result, exp)
 
-        with assertRaisesRegexp(ValueError, 'Dot product shape mismatch'):
+        with tm.assertRaisesRegexp(ValueError, 'Dot product shape mismatch'):
             a.dot(row[:-1])
 
         a = np.random.rand(1, 5)
@@ -2089,7 +2084,8 @@ def test_dot(self):
 
         df = DataFrame(randn(3, 4), index=[1, 2, 3], columns=lrange(4))
         df2 = DataFrame(randn(5, 3), index=lrange(5), columns=[1, 2, 3])
-        assertRaisesRegexp(ValueError, 'aligned', df.dot, df2)
+        with tm.assertRaisesRegexp(ValueError, 'aligned'):
+            df.dot(df2)
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
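Note on the DataFrame.dot expectations above: dot aligns the caller's columns
with the argument's index before multiplying, a plain array argument must
already have the right length, and frames whose labels cannot be aligned raise
ValueError. A sketch of the behaviour the test asserts:

    import numpy as np
    import pandas as pd

    a = pd.DataFrame(np.random.randn(3, 4), columns=list('abcd'))
    b = pd.DataFrame(np.random.randn(4, 2), index=list('abcd'))
    a.dot(b)                    # aligns a.columns with b.index
    a.dot(a.ix[0].values[:-1])  # ValueError: Dot product shape mismatch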
-        assertRaisesRegexp(ValueError, 'aligned', df.dot, df2)
+        with tm.assertRaisesRegexp(ValueError, 'aligned'):
+            df.dot(df2)
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py
index 09dd0f3b14812..9da1b31d259c5 100644
--- a/pandas/tests/frame/test_axis_select_reindex.py
+++ b/pandas/tests/frame/test_axis_select_reindex.py
@@ -221,7 +221,7 @@ def test_reindex(self):
 
         # pass non-Index
         newFrame = self.frame.reindex(list(self.ts1.index))
-        self.assertTrue(newFrame.index.equals(self.ts1.index))
+        self.assert_index_equal(newFrame.index, self.ts1.index)
 
         # copy with no axes
         result = self.frame.reindex()
@@ -381,7 +381,7 @@ def test_align(self):
         # axis = 0
         other = self.frame.ix[:-5, :3]
         af, bf = self.frame.align(other, axis=0, fill_value=-1)
-        self.assertTrue(bf.columns.equals(other.columns))
+        self.assert_index_equal(bf.columns, other.columns)
 
         # test fill value
         join_idx = self.frame.index.join(other.index)
         diff_a = self.frame.index.difference(join_idx)
@@ -391,15 +391,15 @@ def test_align(self):
         self.assertTrue((diff_a_vals == -1).all())
 
         af, bf = self.frame.align(other, join='right', axis=0)
-        self.assertTrue(bf.columns.equals(other.columns))
-        self.assertTrue(bf.index.equals(other.index))
-        self.assertTrue(af.index.equals(other.index))
+        self.assert_index_equal(bf.columns, other.columns)
+        self.assert_index_equal(bf.index, other.index)
+        self.assert_index_equal(af.index, other.index)
 
         # axis = 1
         other = self.frame.ix[:-5, :3].copy()
         af, bf = self.frame.align(other, axis=1)
-        self.assertTrue(bf.columns.equals(self.frame.columns))
-        self.assertTrue(bf.index.equals(other.index))
+        self.assert_index_equal(bf.columns, self.frame.columns)
+        self.assert_index_equal(bf.index, other.index)
 
         # test fill value
         join_idx = self.frame.index.join(other.index)
@@ -413,35 +413,35 @@ def test_align(self):
         self.assertTrue((diff_a_vals == -1).all())
 
         af, bf = self.frame.align(other, join='inner', axis=1)
-        self.assertTrue(bf.columns.equals(other.columns))
+        self.assert_index_equal(bf.columns, other.columns)
 
         af, bf = self.frame.align(other, join='inner', axis=1, method='pad')
-        self.assertTrue(bf.columns.equals(other.columns))
+        self.assert_index_equal(bf.columns, other.columns)
 
         # test other non-float types
         af, bf = self.intframe.align(other, join='inner', axis=1,
                                      method='pad')
-        self.assertTrue(bf.columns.equals(other.columns))
+        self.assert_index_equal(bf.columns, other.columns)
 
         af, bf = self.mixed_frame.align(self.mixed_frame,
                                         join='inner', axis=1, method='pad')
-        self.assertTrue(bf.columns.equals(self.mixed_frame.columns))
+        self.assert_index_equal(bf.columns, self.mixed_frame.columns)
 
         af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1,
                                   method=None, fill_value=None)
-        self.assertTrue(bf.index.equals(Index([])))
+        self.assert_index_equal(bf.index, Index([]))
 
         af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1,
                                   method=None, fill_value=0)
-        self.assertTrue(bf.index.equals(Index([])))
+        self.assert_index_equal(bf.index, Index([]))
 
         # mixed floats/ints
         af, bf = self.mixed_float.align(other.ix[:, 0], join='inner', axis=1,
                                         method=None, fill_value=0)
-        self.assertTrue(bf.index.equals(Index([])))
+        self.assert_index_equal(bf.index, Index([]))
 
         af, bf = self.mixed_int.align(other.ix[:, 0], join='inner', axis=1,
                                       method=None, fill_value=0)
-        self.assertTrue(bf.index.equals(Index([])))
+        self.assert_index_equal(bf.index, Index([]))
         # try to align dataframe to series along bad axis
         self.assertRaises(ValueError, self.frame.align, af.ix[0, :3],
@@ -661,8 +661,24 @@ def test_filter(self):
         assert_frame_equal(filtered, expected)
 
         # pass in None
+        with assertRaisesRegexp(TypeError, 'Must pass'):
+            self.frame.filter()
         with assertRaisesRegexp(TypeError, 'Must pass'):
             self.frame.filter(items=None)
+        with assertRaisesRegexp(TypeError, 'Must pass'):
+            self.frame.filter(axis=1)
+
+        # test mutually exclusive arguments
+        with assertRaisesRegexp(TypeError, 'mutually exclusive'):
+            self.frame.filter(items=['one', 'three'], regex='e$', like='bbi')
+        with assertRaisesRegexp(TypeError, 'mutually exclusive'):
+            self.frame.filter(items=['one', 'three'], regex='e$', axis=1)
+        with assertRaisesRegexp(TypeError, 'mutually exclusive'):
+            self.frame.filter(items=['one', 'three'], regex='e$')
+        with assertRaisesRegexp(TypeError, 'mutually exclusive'):
+            self.frame.filter(items=['one', 'three'], like='bbi', axis=0)
+        with assertRaisesRegexp(TypeError, 'mutually exclusive'):
+            self.frame.filter(items=['one', 'three'], like='bbi')
 
         # objects
         filtered = self.mixed_frame.filter(like='foo')
@@ -810,10 +826,9 @@ def test_reindex_corner(self):
         index = Index(['a', 'b', 'c'])
         dm = self.empty.reindex(index=[1, 2, 3])
         reindexed = dm.reindex(columns=index)
-        self.assertTrue(reindexed.columns.equals(index))
+        self.assert_index_equal(reindexed.columns, index)
 
         # ints are weird
-
         smaller = self.intframe.reindex(columns=['A', 'B', 'E'])
         self.assertEqual(smaller['E'].dtype, np.float64)
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
index f337bf48c05ee..0421cf2ba42d2 100644
--- a/pandas/tests/frame/test_block_internals.py
+++ b/pandas/tests/frame/test_block_internals.py
@@ -505,8 +505,8 @@ def test_get_X_columns(self):
                         'd': [None, None, None],
                         'e': [3.14, 0.577, 2.773]})
 
-        self.assert_numpy_array_equal(df._get_numeric_data().columns,
-                                      ['a', 'b', 'e'])
+        self.assert_index_equal(df._get_numeric_data().columns,
+                                pd.Index(['a', 'b', 'e']))
 
     def test_strange_column_corruption_issue(self):
         # (wesm) Unclear how exactly this is related to internal matters
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 083da2a040ed5..b42aef9447373 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -17,21 +17,13 @@
 from pandas.compat import (lmap, long, zip, range, lrange, lzip,
                            OrderedDict, is_platform_little_endian)
 from pandas import compat
-from pandas import (DataFrame, Index, Series, notnull, isnull,
+from pandas import (DataFrame, Index, Series, isnull,
                     MultiIndex, Timedelta, Timestamp, date_range)
 from pandas.core.common import PandasError
 import pandas as pd
 import pandas.core.common as com
 import pandas.lib as lib
-
-from pandas.types.api import DatetimeTZDtype
-
-from pandas.util.testing import (assert_numpy_array_equal,
-                                 assert_series_equal,
-                                 assert_frame_equal,
-                                 assertRaisesRegexp)
-
 import pandas.util.testing as tm
 
 from pandas.tests.frame.common import TestData
@@ -173,16 +165,16 @@ def test_constructor_rec(self):
         index = self.frame.index
 
         df = DataFrame(rec)
-        self.assert_numpy_array_equal(df.columns, rec.dtype.names)
+        self.assert_index_equal(df.columns, pd.Index(rec.dtype.names))
 
         df2 = DataFrame(rec, index=index)
-        self.assert_numpy_array_equal(df2.columns, rec.dtype.names)
-        self.assertTrue(df2.index.equals(index))
+        self.assert_index_equal(df2.columns, pd.Index(rec.dtype.names))
+        self.assert_index_equal(df2.index, index)
         rng = np.arange(len(rec))[::-1]
 
         df3 = DataFrame(rec, index=rng, columns=['C', 'B'])
         expected = DataFrame(rec, index=rng).reindex(columns=['C', 'B'])
-        assert_frame_equal(df3, expected)
+        tm.assert_frame_equal(df3, expected)
 
     def test_constructor_bool(self):
         df = DataFrame({0: np.ones(10, dtype=bool),
@@ -220,8 +212,15 @@ def test_constructor_dict(self):
         frame = DataFrame({'col1': self.ts1,
                            'col2': self.ts2})
 
-        tm.assert_dict_equal(self.ts1, frame['col1'], compare_keys=False)
-        tm.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False)
+        # col2 is padded with NaN
+        self.assertEqual(len(self.ts1), 30)
+        self.assertEqual(len(self.ts2), 25)
+
+        tm.assert_series_equal(self.ts1, frame['col1'], check_names=False)
+
+        exp = pd.Series(np.concatenate([[np.nan] * 5, self.ts2.values]),
+                        index=self.ts1.index, name='col2')
+        tm.assert_series_equal(exp, frame['col2'])
 
         frame = DataFrame({'col1': self.ts1,
                            'col2': self.ts2},
@@ -241,7 +240,7 @@ def test_constructor_dict(self):
 
         # Length-one dict micro-optimization
         frame = DataFrame({'A': {'1': 1, '2': 2}})
-        self.assert_numpy_array_equal(frame.index, ['1', '2'])
+        self.assert_index_equal(frame.index, pd.Index(['1', '2']))
 
         # empty dict plus index
         idx = Index([0, 1, 2])
@@ -257,7 +256,7 @@ def test_constructor_dict(self):
 
         # with dict of empty list and Series
         frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B'])
-        self.assertTrue(frame.index.equals(Index([])))
+        self.assert_index_equal(frame.index, Index([], dtype=np.int64))
 
         # GH10856
         # dict with scalar values should raise error, even if columns passed
@@ -286,37 +285,37 @@ def test_constructor_multi_index(self):
 
     def test_constructor_error_msgs(self):
         msg = "Empty data passed with indices specified."
         # passing an empty array with columns specified.
-        with assertRaisesRegexp(ValueError, msg):
+        with tm.assertRaisesRegexp(ValueError, msg):
             DataFrame(np.empty(0), columns=list('abc'))
 
         msg = "Mixing dicts with non-Series may lead to ambiguous ordering."
         # mix dict and array, wrong size
-        with assertRaisesRegexp(ValueError, msg):
+        with tm.assertRaisesRegexp(ValueError, msg):
             DataFrame({'A': {'a': 'a', 'b': 'b'},
                        'B': ['a', 'b', 'c']})
 
         # wrong size ndarray, GH 3105
         msg = "Shape of passed values is \(3, 4\), indices imply \(3, 3\)"
-        with assertRaisesRegexp(ValueError, msg):
+        with tm.assertRaisesRegexp(ValueError, msg):
             DataFrame(np.arange(12).reshape((4, 3)),
                       columns=['foo', 'bar', 'baz'],
                       index=pd.date_range('2000-01-01', periods=3))
 
         # higher dim raise exception
-        with assertRaisesRegexp(ValueError, 'Must pass 2-d input'):
+        with tm.assertRaisesRegexp(ValueError, 'Must pass 2-d input'):
             DataFrame(np.zeros((3, 3, 3)), columns=['A', 'B', 'C'], index=[1])
 
         # wrong size axis labels
-        with assertRaisesRegexp(ValueError, "Shape of passed values is "
-                                "\(3, 2\), indices imply \(3, 1\)"):
+        with tm.assertRaisesRegexp(ValueError, "Shape of passed values is "
+                                   "\(3, 2\), indices imply \(3, 1\)"):
             DataFrame(np.random.rand(2, 3), columns=['A', 'B', 'C'], index=[1])
 
-        with assertRaisesRegexp(ValueError, "Shape of passed values is "
-                                "\(3, 2\), indices imply \(2, 2\)"):
+        with tm.assertRaisesRegexp(ValueError, "Shape of passed values is "
+                                   "\(3, 2\), indices imply \(2, 2\)"):
             DataFrame(np.random.rand(2, 3), columns=['A', 'B'], index=[1, 2])
 
-        with assertRaisesRegexp(ValueError, 'If using all scalar values, you '
-                                'must pass an index'):
+        with tm.assertRaisesRegexp(ValueError, 'If using all scalar values, '
+                                   'you must pass an index'):
             DataFrame({'a': False, 'b': True})
 
     def test_constructor_with_embedded_frames(self):
@@ -329,10 +328,10 @@ def test_constructor_with_embedded_frames(self):
         str(df2)
 
         result = df2.loc[0, 0]
-        assert_frame_equal(result, df1)
+        tm.assert_frame_equal(result, df1)
 
         result = df2.loc[1, 0]
-        assert_frame_equal(result, df1 + 10)
+        tm.assert_frame_equal(result, df1 + 10)
 
     def test_constructor_subclass_dict(self):
         # Test for passing dict subclass to constructor
@@ -341,11 +340,11 @@ def test_constructor_subclass_dict(self):
         df = DataFrame(data)
         refdf = DataFrame(dict((col, dict(compat.iteritems(val)))
                                for col, val in compat.iteritems(data)))
-        assert_frame_equal(refdf, df)
+        tm.assert_frame_equal(refdf, df)
 
         data = tm.TestSubDict(compat.iteritems(data))
         df = DataFrame(data)
-        assert_frame_equal(refdf, df)
+        tm.assert_frame_equal(refdf, df)
 
         # try with defaultdict
         from collections import defaultdict
@@ -356,10 +355,10 @@ def test_constructor_subclass_dict(self):
             dct.update(v.to_dict())
             data[k] = dct
         frame = DataFrame(data)
-        assert_frame_equal(self.frame.sort_index(), frame)
+        tm.assert_frame_equal(self.frame.sort_index(), frame)
 
     def test_constructor_dict_block(self):
-        expected = [[4., 3., 2., 1.]]
+        expected = np.array([[4., 3., 2., 1.]])
         df = DataFrame({'d': [4.], 'c': [3.], 'b': [2.], 'a': [1.]},
                        columns=['d', 'c', 'b', 'a'])
         tm.assert_numpy_array_equal(df.values, expected)
@@ -405,10 +404,10 @@ def test_constructor_dict_of_tuples(self):
         result = DataFrame(data)
         expected = DataFrame(dict((k, list(v))
                                   for k, v in compat.iteritems(data)))
-        assert_frame_equal(result, expected, check_dtype=False)
+        tm.assert_frame_equal(result, expected, check_dtype=False)
 
     def test_constructor_dict_multiindex(self):
-        check = lambda result, expected: assert_frame_equal(
+        check = lambda result, expected: tm.assert_frame_equal(
             result, expected, check_dtype=True, check_index_type=True,
             check_column_type=True, check_names=True)
         d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2},
@@ -453,9 +452,9 @@ def create_data(constructor):
         result_datetime64 = DataFrame(data_datetime64)
         result_datetime = DataFrame(data_datetime)
         result_Timestamp = DataFrame(data_Timestamp)
-        assert_frame_equal(result_datetime64, expected)
-        assert_frame_equal(result_datetime, expected)
-        assert_frame_equal(result_Timestamp, expected)
+        tm.assert_frame_equal(result_datetime64, expected)
+        tm.assert_frame_equal(result_datetime, expected)
+        tm.assert_frame_equal(result_Timestamp, expected)
 
     def test_constructor_dict_timedelta64_index(self):
         # GH 10160
@@ -478,9 +477,9 @@ def create_data(constructor):
         result_timedelta64 = DataFrame(data_timedelta64)
         result_timedelta = DataFrame(data_timedelta)
         result_Timedelta = DataFrame(data_Timedelta)
-        assert_frame_equal(result_timedelta64, expected)
-        assert_frame_equal(result_timedelta, expected)
-        assert_frame_equal(result_Timedelta, expected)
+        tm.assert_frame_equal(result_timedelta64, expected)
+        tm.assert_frame_equal(result_timedelta, expected)
+        tm.assert_frame_equal(result_Timedelta, expected)
 
     def test_constructor_period(self):
         # PeriodIndex
@@ -506,7 +505,7 @@ def test_nested_dict_frame_constructor(self):
                 data.setdefault(col, {})[row] = df.get_value(row, col)
 
         result = DataFrame(data, columns=rng)
-        assert_frame_equal(result, df)
+        tm.assert_frame_equal(result, df)
 
         data = {}
         for col in df.columns:
@@ -514,7 +513,7 @@ def test_nested_dict_frame_constructor(self):
                 data.setdefault(row, {})[col] = df.get_value(row, col)
 
         result = DataFrame(data, index=rng).T
-        assert_frame_equal(result, df)
+        tm.assert_frame_equal(result, df)
 
     def _check_basic_constructor(self, empty):
         # mat: 2d matrix with shape (3, 2) to input. empty - makes sized
@@ -538,27 +537,27 @@ def _check_basic_constructor(self, empty):
 
         # wrong size axis labels
         msg = r'Shape of passed values is \(3, 2\), indices imply \(3, 1\)'
-        with assertRaisesRegexp(ValueError, msg):
+        with tm.assertRaisesRegexp(ValueError, msg):
             DataFrame(mat, columns=['A', 'B', 'C'], index=[1])
 
         msg = r'Shape of passed values is \(3, 2\), indices imply \(2, 2\)'
-        with assertRaisesRegexp(ValueError, msg):
+        with tm.assertRaisesRegexp(ValueError, msg):
             DataFrame(mat, columns=['A', 'B'], index=[1, 2])
 
         # higher dim raise exception
-        with assertRaisesRegexp(ValueError, 'Must pass 2-d input'):
+        with tm.assertRaisesRegexp(ValueError, 'Must pass 2-d input'):
             DataFrame(empty((3, 3, 3)), columns=['A', 'B', 'C'],
                       index=[1])
 
         # automatic labeling
         frame = DataFrame(mat)
-        self.assert_numpy_array_equal(frame.index, lrange(2))
-        self.assert_numpy_array_equal(frame.columns, lrange(3))
+        self.assert_index_equal(frame.index, pd.Index(lrange(2)))
+        self.assert_index_equal(frame.columns, pd.Index(lrange(3)))
 
         frame = DataFrame(mat, index=[1, 2])
-        self.assert_numpy_array_equal(frame.columns, lrange(3))
+        self.assert_index_equal(frame.columns, pd.Index(lrange(3)))
 
         frame = DataFrame(mat, columns=['A', 'B', 'C'])
-        self.assert_numpy_array_equal(frame.index, lrange(2))
+        self.assert_index_equal(frame.index, pd.Index(lrange(2)))
 
         # 0-length axis
         frame = DataFrame(empty((0, 3)))
@@ -660,7 +659,7 @@ def test_constructor_mrecarray(self):
 
         # Ensure mrecarray produces frame identical to dict of masked arrays
         # from GH3479
-        assert_fr_equal = functools.partial(assert_frame_equal,
+        assert_fr_equal = functools.partial(tm.assert_frame_equal,
                                             check_index_type=True,
                                             check_column_type=True,
                                             check_frame_type=True)
@@ -734,13 +733,13 @@ def test_constructor_arrays_and_scalars(self):
 
         df = DataFrame({'a': randn(10), 'b': True})
         exp = DataFrame({'a': df['a'].values, 'b': [True] * 10})
-        assert_frame_equal(df, exp)
+        tm.assert_frame_equal(df, exp)
 
         with tm.assertRaisesRegexp(ValueError, 'must pass an index'):
             DataFrame({'a': False, 'b': True})
 
     def test_constructor_DataFrame(self):
         df = DataFrame(self.frame)
-        assert_frame_equal(df, self.frame)
+        tm.assert_frame_equal(df, self.frame)
 
         df_casted = DataFrame(self.frame, dtype=np.int64)
         self.assertEqual(df_casted.values.dtype, np.int64)
@@ -768,17 +767,17 @@ def test_constructor_more(self):
 
         # corner, silly
         # TODO: Fix this Exception to be better...
-        with assertRaisesRegexp(PandasError, 'constructor not '
-                                'properly called'):
+        with tm.assertRaisesRegexp(PandasError, 'constructor not '
+                                   'properly called'):
             DataFrame((1, 2, 3))
 
         # can't cast
         mat = np.array(['foo', 'bar'], dtype=object).reshape(2, 1)
-        with assertRaisesRegexp(ValueError, 'cast'):
+        with tm.assertRaisesRegexp(ValueError, 'cast'):
             DataFrame(mat, index=[0, 1], columns=[0], dtype=float)
 
         dm = DataFrame(DataFrame(self.frame._series))
-        assert_frame_equal(dm, self.frame)
+        tm.assert_frame_equal(dm, self.frame)
 
         # int cast
         dm = DataFrame({'A': np.ones(10, dtype=int),
@@ -791,12 +790,12 @@ def test_constructor_more(self):
     def test_constructor_empty_list(self):
         df = DataFrame([], index=[])
         expected = DataFrame(index=[])
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         # GH 9939
         df = DataFrame([], columns=['A', 'B'])
         expected = DataFrame({}, columns=['A', 'B'])
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         # Empty generator: list(empty_gen()) == []
         def empty_gen():
@@ -804,7 +803,7 @@ def empty_gen(self):
             yield
 
         df = DataFrame(empty_gen(), columns=['A', 'B'])
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
     def test_constructor_list_of_lists(self):
         # GH #484
@@ -818,7 +817,7 @@ def test_constructor_list_of_lists(self):
         expected = DataFrame({0: range(10)})
         data = [np.array(x) for x in range(10)]
         result = DataFrame(data)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_constructor_sequence_like(self):
         # GH 3783
@@ -840,25 +839,25 @@ def __len__(self, n):
         columns = ["num", "str"]
         result = DataFrame(l, columns=columns)
         expected = DataFrame([[1, 'a'], [2, 'b']], columns=columns)
-        assert_frame_equal(result, expected, check_dtype=False)
+        tm.assert_frame_equal(result, expected, check_dtype=False)
 
         # GH 4297
         # support Array
         import array
         result = DataFrame.from_items([('A', array.array('i', range(10)))])
         expected = DataFrame({'A': list(range(10))})
-        assert_frame_equal(result, expected, check_dtype=False)
+        tm.assert_frame_equal(result, expected, check_dtype=False)
 
         expected = DataFrame([list(range(10)), list(range(10))])
         result = DataFrame([array.array('i', range(10)),
                             array.array('i', range(10))])
-        assert_frame_equal(result, expected, check_dtype=False)
+        tm.assert_frame_equal(result, expected, check_dtype=False)
 
     def test_constructor_iterator(self):
 
         expected = DataFrame([list(range(10)), list(range(10))])
         result = DataFrame([range(10), range(10)])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_constructor_generator(self):
         # related #2305
@@ -868,12 +867,12 @@ def test_constructor_generator(self):
 
         expected = DataFrame([list(range(10)), list(range(10))])
         result = DataFrame([gen1, gen2])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         gen = ([i, 'a'] for i in range(10))
         result = DataFrame(gen)
         expected = DataFrame({0: range(10), 1: 'a'})
-        assert_frame_equal(result, expected, check_dtype=False)
+        tm.assert_frame_equal(result, expected, check_dtype=False)
 
     def test_constructor_list_of_dicts(self):
         data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]),
@@ -886,11 +885,50 @@ def test_constructor_list_of_dicts(self):
         result = DataFrame(data)
         expected = DataFrame.from_dict(dict(zip(range(len(data)), data)),
                                        orient='index')
-        assert_frame_equal(result, expected.reindex(result.index))
+        tm.assert_frame_equal(result, expected.reindex(result.index))
 
         result = DataFrame([{}])
         expected = DataFrame(index=[0])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_ordered_dict_preserve_order(self):
+        # see gh-13304
+        expected = DataFrame([[2, 1]], columns=['b', 'a'])
+
+        data = OrderedDict()
+        data['b'] = [2]
+        data['a'] = [1]
+
+        result = DataFrame(data)
+        tm.assert_frame_equal(result, expected)
+
+        data = OrderedDict()
+        data['b'] = 2
+        data['a'] = 1
+
+        result = DataFrame([data])
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_ordered_dict_conflicting_orders(self):
+        # the first dict element sets the ordering for the DataFrame,
+        # even if there are conflicting orders from subsequent ones
+        row_one = OrderedDict()
+        row_one['b'] = 2
+        row_one['a'] = 1
+
+        row_two = OrderedDict()
+        row_two['a'] = 1
+        row_two['b'] = 2
+
+        row_three = {'b': 2, 'a': 1}
+
+        expected = DataFrame([[2, 1], [2, 1]], columns=['b', 'a'])
+        result = DataFrame([row_one, row_two])
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=['b', 'a'])
+        result = DataFrame([row_one, row_two, row_three])
+        tm.assert_frame_equal(result, expected)
 
     def test_constructor_list_of_series(self):
         data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]),
@@ -903,7 +941,7 @@ def test_constructor_list_of_series(self):
                  Series([1.5, 3, 6], idx, name='y')]
         result = DataFrame(data2)
         expected = DataFrame.from_dict(sdict, orient='index')
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # some unnamed
         data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'),
@@ -912,7 +950,7 @@ def test_constructor_list_of_series(self):
 
         sdict = OrderedDict(zip(['x', 'Unnamed 0'], data))
         expected = DataFrame.from_dict(sdict, orient='index')
-        assert_frame_equal(result.sort_index(), expected)
+        tm.assert_frame_equal(result.sort_index(), expected)
 
         # none named
         data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]),
@@ -926,14 +964,14 @@ def test_constructor_list_of_series(self):
         result = DataFrame(data)
         sdict = OrderedDict(zip(range(len(data)), data))
         expected = DataFrame.from_dict(sdict, orient='index')
-        assert_frame_equal(result, expected.reindex(result.index))
+        tm.assert_frame_equal(result, expected.reindex(result.index))
 
         result2 = DataFrame(data, index=np.arange(6))
-        assert_frame_equal(result, result2)
+        tm.assert_frame_equal(result, result2)
 
         result = DataFrame([Series({})])
         expected = DataFrame(index=[0])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]),
                 OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])]
@@ -944,7 +982,7 @@ def test_constructor_list_of_series(self):
                  Series([1.5, 3, 6], idx)]
         result = DataFrame(data2)
         expected = DataFrame.from_dict(sdict, orient='index')
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_constructor_list_of_derived_dicts(self):
         class CustomDict(dict):
@@ -956,19 +994,20 @@ class CustomDict(dict):
 
         result_custom = DataFrame(data_custom)
         result = DataFrame(data)
-        assert_frame_equal(result, result_custom)
+        tm.assert_frame_equal(result, result_custom)
 
     def test_constructor_ragged(self):
         data = {'A': randn(10),
                 'B': randn(8)}
-        with assertRaisesRegexp(ValueError, 'arrays must all be same length'):
+        with tm.assertRaisesRegexp(ValueError,
+                                   'arrays must all be same length'):
             DataFrame(data)
 
     def test_constructor_scalar(self):
         idx = Index(lrange(3))
         df = DataFrame({"a": 0}, index=idx)
         expected = DataFrame({"a": [0, 0, 0]}, index=idx)
-        assert_frame_equal(df, expected, check_dtype=False)
+        tm.assert_frame_equal(df, expected, check_dtype=False)
 
     def test_constructor_Series_copy_bug(self):
         df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A'])
@@ -983,7 +1022,7 @@ def test_constructor_mixed_dict_and_Series(self):
         self.assertTrue(result.index.is_monotonic)
 
         # ordering ambiguous, raise exception
-        with assertRaisesRegexp(ValueError, 'ambiguous ordering'):
+        with tm.assertRaisesRegexp(ValueError, 'ambiguous ordering'):
             DataFrame({'A': ['a', 'b'], 'B': {'a': 'a', 'b': 'b'}})
 
         # this is OK though
@@ -991,12 +1030,12 @@ def test_constructor_mixed_dict_and_Series(self):
                             'B': Series(['a', 'b'], index=['a', 'b'])})
         expected = DataFrame({'A': ['a', 'b'], 'B': ['a', 'b']},
                              index=['a', 'b'])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_constructor_tuples(self):
         result = DataFrame({'A': [(1, 2), (3, 4)]})
         expected = DataFrame({'A': Series([(1, 2), (3, 4)])})
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_constructor_namedtuples(self):
         # GH11181
@@ -1005,43 +1044,43 @@ def test_constructor_namedtuples(self):
         tuples = [named_tuple(1, 3), named_tuple(2, 4)]
         expected = DataFrame({'a': [1, 2], 'b': [3, 4]})
         result = DataFrame(tuples)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # with columns
         expected = DataFrame({'y': [1, 2], 'z': [3, 4]})
         result = DataFrame(tuples, columns=['y', 'z'])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_constructor_orient(self):
         data_dict = self.mixed_frame.T._series
         recons = DataFrame.from_dict(data_dict, orient='index')
         expected = self.mixed_frame.sort_index()
-        assert_frame_equal(recons, expected)
+        tm.assert_frame_equal(recons, expected)
 
         # dict of sequence
         a = {'hi': [32, 3, 3],
              'there': [3, 5, 3]}
         rs = DataFrame.from_dict(a, orient='index')
         xp = DataFrame.from_dict(a).T.reindex(list(a.keys()))
-        assert_frame_equal(rs, xp)
+        tm.assert_frame_equal(rs, xp)
 
     def test_constructor_Series_named(self):
         a = Series([1, 2, 3], index=['a', 'b', 'c'], name='x')
         df = DataFrame(a)
         self.assertEqual(df.columns[0], 'x')
-        self.assertTrue(df.index.equals(a.index))
+        self.assert_index_equal(df.index, a.index)
 
         # ndarray like
         arr = np.random.randn(10)
         s = Series(arr, name='x')
         df = DataFrame(s)
         expected = DataFrame(dict(x=s))
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         s = Series(arr, index=range(3, 13))
         df = DataFrame(s)
         expected = DataFrame({0: s})
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         self.assertRaises(ValueError, DataFrame, s, columns=[1, 2])
 
@@ -1055,12 +1094,12 @@ def test_constructor_Series_named(self):
         df = DataFrame([s1, arr]).T
         expected = DataFrame({'x': s1, 'Unnamed 0': arr},
                              columns=['x', 'Unnamed 0'])
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         # this is a bit non-intuitive here; the series collapse down to arrays
         df = DataFrame([arr, s1]).T
         expected = DataFrame({1: s1, 0: arr}, columns=[0, 1])
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
     def test_constructor_Series_differently_indexed(self):
         # name
@@ -1074,13 +1113,13 @@ def test_constructor_Series_differently_indexed(self):
         df1 = DataFrame(s1, index=other_index)
         exp1 = DataFrame(s1.reindex(other_index))
         self.assertEqual(df1.columns[0], 'x')
-        assert_frame_equal(df1, exp1)
+        tm.assert_frame_equal(df1, exp1)
 
         df2 = DataFrame(s2, index=other_index)
         exp2 = DataFrame(s2.reindex(other_index))
         self.assertEqual(df2.columns[0], 0)
-        self.assertTrue(df2.index.equals(other_index))
-        assert_frame_equal(df2, exp2)
+        self.assert_index_equal(df2.index, other_index)
+        tm.assert_frame_equal(df2, exp2)
 
     def test_constructor_manager_resize(self):
         index = list(self.frame.index[:5])
@@ -1088,17 +1127,17 @@ def test_constructor_manager_resize(self):
 
         result = DataFrame(self.frame._data, index=index,
                            columns=columns)
-        self.assert_numpy_array_equal(result.index, index)
-        self.assert_numpy_array_equal(result.columns, columns)
+        self.assert_index_equal(result.index, Index(index))
+        self.assert_index_equal(result.columns, Index(columns))
 
     def test_constructor_from_items(self):
         items = [(c, self.frame[c]) for c in self.frame.columns]
         recons = DataFrame.from_items(items)
-        assert_frame_equal(recons, self.frame)
+        tm.assert_frame_equal(recons, self.frame)
 
         # pass some columns
         recons = DataFrame.from_items(items, columns=['C', 'B', 'A'])
-        assert_frame_equal(recons, self.frame.ix[:, ['C', 'B', 'A']])
+        tm.assert_frame_equal(recons, self.frame.ix[:, ['C', 'B', 'A']])
 
         # orient='index'
 
@@ -1108,7 +1147,7 @@ def test_constructor_from_items(self):
         recons = DataFrame.from_items(row_items,
                                       columns=self.mixed_frame.columns,
                                       orient='index')
-        assert_frame_equal(recons, self.mixed_frame)
+        tm.assert_frame_equal(recons, self.mixed_frame)
         self.assertEqual(recons['A'].dtype, np.float64)
 
         with tm.assertRaisesRegexp(TypeError,
@@ -1124,7 +1163,7 @@ def test_constructor_from_items(self):
         recons = DataFrame.from_items(row_items,
                                       columns=self.mixed_frame.columns,
                                       orient='index')
-        assert_frame_equal(recons, self.mixed_frame)
+        tm.assert_frame_equal(recons, self.mixed_frame)
         tm.assertIsInstance(recons['foo'][0], tuple)
 
         rs = DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])],
@@ -1132,12 +1171,12 @@ def test_constructor_from_items(self):
                                   columns=['one', 'two', 'three'])
         xp = DataFrame([[1, 2, 3], [4, 5, 6]], index=['A', 'B'],
                        columns=['one', 'two', 'three'])
-        assert_frame_equal(rs, xp)
+        tm.assert_frame_equal(rs, xp)
 
     def test_constructor_mix_series_nonseries(self):
         df = DataFrame({'A': self.frame['A'],
                         'B': list(self.frame['B'])}, columns=['A', 'B'])
-        assert_frame_equal(df, self.frame.ix[:, ['A', 'B']])
+        tm.assert_frame_equal(df, self.frame.ix[:, ['A', 'B']])
 
         with tm.assertRaisesRegexp(ValueError, 'does not match index length'):
             DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])[:-2]})
@@ -1145,10 +1184,10 @@ def test_constructor_mix_series_nonseries(self):
     def test_constructor_miscast_na_int_dtype(self):
         df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64)
         expected = DataFrame([[np.nan, 1], [1, 0]])
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
     def test_constructor_iterator_failure(self):
-        with assertRaisesRegexp(TypeError, 'iterator'):
+        with tm.assertRaisesRegexp(TypeError, 'iterator'):
             df = DataFrame(iter([1, 2, 3]))  # noqa
 
     def test_constructor_column_duplicates(self):
@@ -1157,11 +1196,11 @@ def test_constructor_column_duplicates(self):
         edf = DataFrame([[8, 5]])
         edf.columns = ['a', 'a']
-        assert_frame_equal(df, edf)
+        tm.assert_frame_equal(df, edf)
 
         idf = DataFrame.from_items(
             [('a', [8]), ('a', [5])], columns=['a', 'a'])
-        assert_frame_equal(idf, edf)
+        tm.assert_frame_equal(idf, edf)
 
         self.assertRaises(ValueError, DataFrame.from_items,
                           [('a', [8]), ('a', [5]), ('b', [6])],
@@ -1172,30 +1211,29 @@ def test_constructor_empty_with_string_dtype(self):
         expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)
 
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str)
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_)
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.unicode_)
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype='U5')
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
     def test_constructor_single_value(self):
         # expecting single value upcasting here
         df = DataFrame(0., index=[1, 2, 3], columns=['a', 'b', 'c'])
-        assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('float64'),
-                                         df.index, df.columns))
+        tm.assert_frame_equal(df,
+                              DataFrame(np.zeros(df.shape).astype('float64'),
+                                        df.index, df.columns))
 
         df = DataFrame(0, index=[1, 2, 3], columns=['a', 'b', 'c'])
-        assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('int64'),
-                                         df.index, df.columns))
+        tm.assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('int64'),
+                                            df.index, df.columns))
 
         df = DataFrame('a', index=[1, 2], columns=['a', 'c'])
-        assert_frame_equal(df, DataFrame(np.array([['a', 'a'],
-                                                   ['a', 'a']],
-                                                  dtype=object),
-                                         index=[1, 2],
-                                         columns=['a', 'c']))
+        tm.assert_frame_equal(df, DataFrame(np.array([['a', 'a'], ['a', 'a']],
+                                                     dtype=object),
+                                            index=[1, 2], columns=['a', 'c']))
 
         self.assertRaises(com.PandasError, DataFrame, 'a', [1, 2])
         self.assertRaises(com.PandasError, DataFrame, 'a', columns=['a', 'c'])
@@ -1217,7 +1255,7 @@ def test_constructor_with_datetimes(self):
         expected = Series({'int64': 1, datetime64name: 2, objectname: 2})
         result.sort_index()
         expected.sort_index()
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
         # check with ndarray construction ndim==0 (e.g. we are passing a ndim 0
         # ndarray with a dtype specified)
@@ -1241,7 +1279,7 @@ def test_constructor_with_datetimes(self):
         result.sort_index()
         expected = Series(expected)
         expected.sort_index()
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         # check with ndarray construction ndim>0
         df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
@@ -1250,7 +1288,7 @@ def test_constructor_with_datetimes(self):
                        index=np.arange(10))
         result = df.get_dtype_counts()
         result.sort_index()
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         # GH 2809
         ind = date_range(start="2000-01-01", freq="D", periods=10)
@@ -1262,7 +1300,7 @@ def test_constructor_with_datetimes(self):
         expected = Series({datetime64name: 1})
         result.sort_index()
         expected.sort_index()
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         # GH 2810
         ind = date_range(start="2000-01-01", freq="D", periods=10)
@@ -1273,7 +1311,7 @@ def test_constructor_with_datetimes(self):
         expected = Series({datetime64name: 1, objectname: 1})
         result.sort_index()
         expected.sort_index()
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         # GH 7594
         # don't coerce tz-aware
@@ -1283,12 +1321,12 @@ def test_constructor_with_datetimes(self):
 
         df = DataFrame({'End Date': dt}, index=[0])
         self.assertEqual(df.iat[0, 0], dt)
-        assert_series_equal(df.dtypes, Series(
+        tm.assert_series_equal(df.dtypes, Series(
             {'End Date': 'datetime64[ns, US/Eastern]'}))
 
         df = DataFrame([{'End Date': dt}])
         self.assertEqual(df.iat[0, 0], dt)
-        assert_series_equal(df.dtypes, Series(
+        tm.assert_series_equal(df.dtypes, Series(
             {'End Date': 'datetime64[ns, US/Eastern]'}))
 
         # tz-aware (UTC and other tz's)
@@ -1311,196 +1349,17 @@ def test_constructor_with_datetimes(self):
             {'a': i.to_series(keep_tz=True).reset_index(drop=True)})
         df = DataFrame()
         df['a'] = i
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         df = DataFrame({'a': i})
-        assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected)
 
         # multiples
         i_no_tz = date_range('1/1/2011', periods=5, freq='10s')
         df = DataFrame({'a': i, 'b': i_no_tz})
         expected = DataFrame({'a': i.to_series(keep_tz=True)
                               .reset_index(drop=True), 'b': i_no_tz})
-        assert_frame_equal(df, expected)
-
-    def test_constructor_with_datetime_tz(self):
-
-        # 8260
-        # support datetime64 with tz
-
-        idx = Index(date_range('20130101', periods=3, tz='US/Eastern'),
-                    name='foo')
-        dr = date_range('20130110', periods=3)
-
-        # construction
-        df = DataFrame({'A': idx, 'B': dr})
-        self.assertTrue(df['A'].dtype, 'M8[ns, US/Eastern')
-        self.assertTrue(df['A'].name == 'A')
-        assert_series_equal(df['A'], Series(idx, name='A'))
-        assert_series_equal(df['B'], Series(dr, name='B'))
-
-        # construction from dict
-        df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
-                             B=Timestamp('20130603', tz='CET')),
-                        index=range(5))
-        assert_series_equal(df2.dtypes, Series(['datetime64[ns, US/Eastern]',
-                                                'datetime64[ns, CET]'],
-                                               index=['A', 'B']))
-
-        # dtypes
-        tzframe = DataFrame({'A': date_range('20130101', periods=3),
-                             'B': date_range('20130101', periods=3,
-                                             tz='US/Eastern'),
-                             'C': date_range('20130101', periods=3, tz='CET')})
-        tzframe.iloc[1, 1] = pd.NaT
-        tzframe.iloc[1, 2] = pd.NaT
-        result = tzframe.dtypes.sort_index()
-        expected = Series([np.dtype('datetime64[ns]'),
-                           DatetimeTZDtype('datetime64[ns, US/Eastern]'),
-                           DatetimeTZDtype('datetime64[ns, CET]')],
-                          ['A', 'B', 'C'])
-
-        # concat
-        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
-        assert_frame_equal(df2, df3)
-
-        # select_dtypes
-        result = df3.select_dtypes(include=['datetime64[ns]'])
-        expected = df3.reindex(columns=[])
-        assert_frame_equal(result, expected)
-
-        # this will select based on issubclass, and these are the same class
-        result = df3.select_dtypes(include=['datetime64[ns, CET]'])
-        expected = df3
-        assert_frame_equal(result, expected)
-
-        # from index
-        idx2 = date_range('20130101', periods=3, tz='US/Eastern', name='foo')
-        df2 = DataFrame(idx2)
-        assert_series_equal(df2['foo'], Series(idx2, name='foo'))
-        df2 = DataFrame(Series(idx2))
-        assert_series_equal(df2['foo'], Series(idx2, name='foo'))
-
-        idx2 = date_range('20130101', periods=3, tz='US/Eastern')
-        df2 = DataFrame(idx2)
-        assert_series_equal(df2[0], Series(idx2, name=0))
-        df2 = DataFrame(Series(idx2))
-        assert_series_equal(df2[0], Series(idx2, name=0))
-
-        # interleave with object
-        result = self.tzframe.assign(D='foo').values
-        expected = np.array([[Timestamp('2013-01-01 00:00:00'),
-                              Timestamp('2013-01-02 00:00:00'),
-                              Timestamp('2013-01-03 00:00:00')],
-                             [Timestamp('2013-01-01 00:00:00-0500',
-                                        tz='US/Eastern'),
-                              pd.NaT,
-                              Timestamp('2013-01-03 00:00:00-0500',
-                                        tz='US/Eastern')],
-                             [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
-                              pd.NaT,
-                              Timestamp('2013-01-03 00:00:00+0100', tz='CET')],
-                             ['foo', 'foo', 'foo']], dtype=object).T
-        self.assert_numpy_array_equal(result, expected)
-
-        # interleave with only datetime64[ns]
-        result = self.tzframe.values
-        expected = np.array([[Timestamp('2013-01-01 00:00:00'),
-                              Timestamp('2013-01-02 00:00:00'),
-                              Timestamp('2013-01-03 00:00:00')],
-                             [Timestamp('2013-01-01 00:00:00-0500',
-                                        tz='US/Eastern'),
-                              pd.NaT,
-                              Timestamp('2013-01-03 00:00:00-0500',
-                                        tz='US/Eastern')],
-                             [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
-                              pd.NaT,
-                              Timestamp('2013-01-03 00:00:00+0100',
-                                        tz='CET')]], dtype=object).T
-        self.assert_numpy_array_equal(result, expected)
-
-        # astype
-        expected = np.array([[Timestamp('2013-01-01 00:00:00'),
-                              Timestamp('2013-01-02 00:00:00'),
-                              Timestamp('2013-01-03 00:00:00')],
-                             [Timestamp('2013-01-01 00:00:00-0500',
-                                        tz='US/Eastern'),
-                              pd.NaT,
-                              Timestamp('2013-01-03 00:00:00-0500',
-                                        tz='US/Eastern')],
-                             [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
-                              pd.NaT,
-                              Timestamp('2013-01-03 00:00:00+0100',
-                                        tz='CET')]],
-                            dtype=object).T
-        result = self.tzframe.astype(object)
-        assert_frame_equal(result, DataFrame(
-            expected, index=self.tzframe.index, columns=self.tzframe.columns))
-
-        result = self.tzframe.astype('datetime64[ns]')
-        expected = DataFrame({'A': date_range('20130101', periods=3),
-                              'B': (date_range('20130101', periods=3,
-                                               tz='US/Eastern')
-                                    .tz_convert('UTC')
-                                    .tz_localize(None)),
-                              'C': (date_range('20130101', periods=3,
-                                               tz='CET')
-                                    .tz_convert('UTC')
-                                    .tz_localize(None))})
-        expected.iloc[1, 1] = pd.NaT
-        expected.iloc[1, 2] = pd.NaT
-        assert_frame_equal(result, expected)
-
-        # str formatting
-        result = self.tzframe.astype(str)
-        expected = np.array([['2013-01-01', '2013-01-01 00:00:00-05:00',
-                              '2013-01-01 00:00:00+01:00'],
-                             ['2013-01-02', 'NaT', 'NaT'],
-                             ['2013-01-03', '2013-01-03 00:00:00-05:00',
-                              '2013-01-03 00:00:00+01:00']], dtype=object)
-        self.assert_numpy_array_equal(result, expected)
-
-        result = str(self.tzframe)
-        self.assertTrue('0 2013-01-01 2013-01-01 00:00:00-05:00 '
-                        '2013-01-01 00:00:00+01:00' in result)
-        self.assertTrue('1 2013-01-02 '
-                        'NaT NaT' in result)
-        self.assertTrue('2 2013-01-03 2013-01-03 00:00:00-05:00 '
-                        '2013-01-03 00:00:00+01:00' in result)
-
-        # setitem
-        df['C'] = idx
-        assert_series_equal(df['C'], Series(idx, name='C'))
-
-        df['D'] = 'foo'
-        df['D'] = idx
-        assert_series_equal(df['D'], Series(idx, name='D'))
-        del df['D']
-
-        # assert that A & C are not sharing the same base (e.g. they
-        # are copies)
-        b1 = df._data.blocks[1]
-        b2 = df._data.blocks[2]
-        self.assertTrue(b1.values.equals(b2.values))
-        self.assertFalse(id(b1.values.values.base) ==
-                         id(b2.values.values.base))
-
-        # with nan
-        df2 = df.copy()
-        df2.iloc[1, 1] = pd.NaT
-        df2.iloc[1, 2] = pd.NaT
-        result = df2['B']
-        assert_series_equal(notnull(result), Series(
-            [True, False, True], name='B'))
-        assert_series_equal(df2.dtypes, df.dtypes)
-
-        # set/reset
-        df = DataFrame({'A': [0, 1, 2]}, index=idx)
-        result = df.reset_index()
-        self.assertTrue(result['foo'].dtype, 'M8[ns, US/Eastern')
-
-        result = result.set_index('foo')
-        tm.assert_index_equal(df.index, idx)
+        tm.assert_frame_equal(df, expected)
 
     def test_constructor_for_list_with_dtypes(self):
         # TODO(wesm): unused
@@ -1523,39 +1382,39 @@ def test_constructor_for_list_with_dtypes(self):
         df = DataFrame({'a': [2 ** 31, 2 ** 31 + 1]})
         result = df.get_dtype_counts()
         expected = Series({'int64': 1})
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         # GH #2751 (construction with no index specified), make sure we cast to
         # platform values
         df = DataFrame([1, 2])
         result = df.get_dtype_counts()
         expected = Series({'int64': 1})
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         df = DataFrame([1., 2.])
         result = df.get_dtype_counts()
         expected = Series({'float64': 1})
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         df = DataFrame({'a': [1, 2]})
         result = df.get_dtype_counts()
         expected = Series({'int64': 1})
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         df = DataFrame({'a': [1., 2.]})
         result = df.get_dtype_counts()
        expected = Series({'float64': 1})
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         df = DataFrame({'a': 1}, index=lrange(3))
         result = df.get_dtype_counts()
         expected = Series({'int64': 1})
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         df = DataFrame({'a': 1.}, index=lrange(3))
         result = df.get_dtype_counts()
         expected = Series({'float64': 1})
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
         # with object list
         df = DataFrame({'a': [1, 2, 4, 7], 'b': [1.2, 2.3, 5.1, 6.3],
@@ -1567,7 +1426,7 @@ def test_constructor_for_list_with_dtypes(self):
             {'int64': 1, 'float64': 2, datetime64name: 1, objectname: 1})
         result.sort_index()
         expected.sort_index()
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
     def test_constructor_frame_copy(self):
         cop = DataFrame(self.frame, copy=True)
@@ -1605,7 +1464,8 @@ def check(df):
 
             indexer = np.arange(len(df.columns))[isnull(df.columns)]
 
             if len(indexer) == 1:
-                assert_series_equal(df.iloc[:, indexer[0]], df.loc[:, np.nan])
+                tm.assert_series_equal(df.iloc[:, indexer[0]],
+                                       df.loc[:, np.nan])
 
             # multiple nans should fail
             else:
@@ -1642,17 +1502,17 @@ def test_from_records_to_records(self):
 
         # TODO(wesm): unused
         frame = DataFrame.from_records(arr)  # noqa
 
-        index = np.arange(len(arr))[::-1]
+        index = pd.Index(np.arange(len(arr))[::-1])
         indexed_frame = DataFrame.from_records(arr, index=index)
-        self.assert_numpy_array_equal(indexed_frame.index, index)
+        self.assert_index_equal(indexed_frame.index, index)
 
         # without names, it should go to last ditch
         arr2 = np.zeros((2, 3))
-        assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2))
+        tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2))
 
         # wrong length
         msg = r'Shape of passed values is \(3, 2\), indices imply \(3, 1\)'
-        with assertRaisesRegexp(ValueError, msg):
+        with tm.assertRaisesRegexp(ValueError, msg):
             DataFrame.from_records(arr, index=index[:-1])
 
         indexed_frame = DataFrame.from_records(arr, index='f1')
@@ -1683,14 +1543,14 @@ def test_from_records_iterator(self):
                         'u': np.array([1.0, 3.0], dtype=np.float32),
                         'y': np.array([2, 4], dtype=np.int64),
                         'z': np.array([2, 4], dtype=np.int32)})
-        assert_frame_equal(df.reindex_like(xp), xp)
+        tm.assert_frame_equal(df.reindex_like(xp), xp)
 
         # no dtypes specified here, so just compare with the default
         arr = [(1.0, 2), (3.0, 4), (5., 6), (7., 8)]
         df = DataFrame.from_records(iter(arr), columns=['x', 'y'],
                                     nrows=2)
-        assert_frame_equal(df, xp.reindex(
-            columns=['x', 'y']), check_dtype=False)
+        tm.assert_frame_equal(df, xp.reindex(columns=['x', 'y']),
+                              check_dtype=False)
 
     def test_from_records_tuples_generator(self):
         def tuple_generator(length):
@@ -1707,7 +1567,7 @@ def tuple_generator(length):
 
         generator = tuple_generator(10)
         result = DataFrame.from_records(generator, columns=columns_names)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_from_records_lists_generator(self):
         def list_generator(length):
@@ -1724,7 +1584,7 @@ def list_generator(length):
 
         generator = list_generator(10)
         result = DataFrame.from_records(generator, columns=columns_names)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_from_records_columns_not_modified(self):
         tuples = [(1, 2, 3),
@@ -1757,7 +1617,7 @@ def test_from_records_duplicates(self):
 
         expected = DataFrame([(1, 2, 3), (4, 5, 6)],
                              columns=['a', 'b', 'a'])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_from_records_set_index_name(self):
         def create_dict(order_id):
@@ -1782,7 +1642,7 @@ def test_from_records_misc_brokenness(self):
 
         result = DataFrame.from_records(data, columns=['a', 'b'])
         exp = DataFrame(data, columns=['a', 'b'])
-        assert_frame_equal(result, exp)
+        tm.assert_frame_equal(result, exp)
 
         # overlap in index/index_names
 
@@ -1790,7 +1650,7 @@ def test_from_records_misc_brokenness(self):
         result = DataFrame.from_records(data, index=['a', 'b', 'c'])
         exp = DataFrame(data, index=['a', 'b', 'c'])
-        assert_frame_equal(result, exp)
+        tm.assert_frame_equal(result, exp)
 
         # GH 2623
         rows = []
@@ -1806,28 +1666,28 @@ def test_from_records_misc_brokenness(self):
         df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
         results = df2_obj.get_dtype_counts()
         expected = Series({'datetime64[ns]': 1, 'int64': 1})
-        assert_series_equal(results, expected)
+        tm.assert_series_equal(results, expected)
 
     def test_from_records_empty(self):
         # 3562
         result = DataFrame.from_records([], columns=['a', 'b', 'c'])
         expected = DataFrame(columns=['a', 'b', 'c'])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         result = DataFrame.from_records([], columns=['a', 'b', 'b'])
         expected = DataFrame(columns=['a', 'b', 'b'])
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_from_records_empty_with_nonempty_fields_gh3682(self):
         a = np.array([(1, 2)], dtype=[('id', np.int64), ('value', np.int64)])
         df = DataFrame.from_records(a, index='id')
-        assert_numpy_array_equal(df.index, Index([1], name='id'))
+        tm.assert_index_equal(df.index, Index([1], name='id'))
         self.assertEqual(df.index.name, 'id')
-        assert_numpy_array_equal(df.columns, Index(['value']))
+        tm.assert_index_equal(df.columns, Index(['value']))
 
         b = np.array([], dtype=[('id', np.int64), ('value', np.int64)])
         df = DataFrame.from_records(b, index='id')
-        assert_numpy_array_equal(df.index, Index([], name='id'))
+        tm.assert_index_equal(df.index, Index([], name='id'))
         self.assertEqual(df.index.name, 'id')
 
     def test_from_records_with_datetimes(self):
@@ -1850,14 +1710,14 @@ def test_from_records_with_datetimes(self):
             raise nose.SkipTest("known failure of numpy rec array creation")
 
         result = DataFrame.from_records(recarray)
-        assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # coercion should work too
         arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])]
         dtypes = [('EXPIRY', '
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
         self.assertTrue((i.labels[0] >= 0).all())
         self.assertTrue((i.labels[1] >= 0).all())
 
+    def test_where(self):
+        i = MultiIndex.from_tuples([('A', 1), ('A', 2)])
+
+        def f():
+            i.where(True)
+
+        self.assertRaises(NotImplementedError, f)
+
     def test_repeat(self):
         reps = 2
         numbers = [1, 2, 3]
@@ -636,7 +644,7 @@ def test_from_product(self):
                   ('buz', 'c')]
         expected = MultiIndex.from_tuples(tuples, names=names)
 
-        tm.assert_numpy_array_equal(result, expected)
+        tm.assert_index_equal(result, expected)
         self.assertEqual(result.names, names)
 
     def test_from_product_datetimeindex(self):
@@ -673,14 +681,14 @@ def test_append(self):
 
     def test_get_level_values(self):
         result = self.index.get_level_values(0)
-        expected = ['foo', 'foo', 'bar', 'baz', 'qux', 'qux']
-        tm.assert_numpy_array_equal(result, expected)
-
+        expected = Index(['foo', 'foo', 'bar', 'baz', 'qux', 'qux'],
+                         name='first')
+        tm.assert_index_equal(result, expected)
         self.assertEqual(result.name, 'first')
 
         result = self.index.get_level_values('first')
         expected = self.index.get_level_values(0)
-        tm.assert_numpy_array_equal(result, expected)
+        tm.assert_index_equal(result, expected)
 
         # GH 10460
         index = MultiIndex(levels=[CategoricalIndex(
@@ -695,19 +703,19 @@ def test_get_level_values_na(self):
         arrays = [['a', 'b', 'b'], [1, np.nan, 2]]
         index = pd.MultiIndex.from_arrays(arrays)
         values = index.get_level_values(1)
-        expected = [1, np.nan, 2]
+        expected = np.array([1, np.nan, 2])
         tm.assert_numpy_array_equal(values.values.astype(float), expected)
 
         arrays = [['a', 'b', 'b'], [np.nan, np.nan, 2]]
         index = pd.MultiIndex.from_arrays(arrays)
         values = index.get_level_values(1)
-        expected = [np.nan, np.nan, 2]
+        expected = np.array([np.nan, np.nan, 2])
         tm.assert_numpy_array_equal(values.values.astype(float), expected)
 
         arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]]
         index = pd.MultiIndex.from_arrays(arrays)
         values = index.get_level_values(0)
-        expected = [np.nan, np.nan, np.nan]
+        expected = np.array([np.nan, np.nan, np.nan])
         tm.assert_numpy_array_equal(values.values.astype(float), expected)
         values = index.get_level_values(1)
         expected = np.array(['a', np.nan, 1], dtype=object)
@@ -1023,10 +1031,10 @@ def test_get_indexer(self):
         idx2 = index[[1, 3, 5]]
 
         r1 = idx1.get_indexer(idx2)
-        assert_almost_equal(r1, [1, 3, -1])
+        assert_almost_equal(r1, np.array([1, 3, -1]))
 
         r1 = idx2.get_indexer(idx1, method='pad')
-        e1 = [-1, 0, 0, 1, 1]
+        e1 = np.array([-1, 0, 0, 1, 1])
         assert_almost_equal(r1, e1)
 
         r2 = idx2.get_indexer(idx1[::-1], method='pad')
@@ -1036,7 +1044,7 @@ def test_get_indexer(self):
         assert_almost_equal(r1, rffill1)
 
         r1 = idx2.get_indexer(idx1, method='backfill')
-        e1 = [0, 0, 1, 1, 2]
+        e1 = np.array([0, 0, 1, 1, 2])
         assert_almost_equal(r1, e1)
         r2 = idx2.get_indexer(idx1[::-1], method='backfill')
@@ -1056,9 +1064,10 @@ def test_get_indexer(self):
         # create index with duplicates
         idx1 = Index(lrange(10) + lrange(10))
         idx2 = Index(lrange(20))
-        assertRaisesRegexp(InvalidIndexError, "Reindexing only valid with"
-                           " uniquely valued Index objects", idx1.get_indexer,
-                           idx2)
+
+        msg = "Reindexing only valid with uniquely valued Index objects"
+        with assertRaisesRegexp(InvalidIndexError, msg):
+            idx1.get_indexer(idx2)
 
     def test_get_indexer_nearest(self):
         midx = MultiIndex.from_tuples([('a', 1), ('b', 2)])
@@ -1516,15 +1525,18 @@ def test_insert(self):
 
         # key not contained in all levels
         new_index = self.index.insert(0, ('abc', 'three'))
-        tm.assert_numpy_array_equal(new_index.levels[0],
-                                    list(self.index.levels[0]) + ['abc'])
-        tm.assert_numpy_array_equal(new_index.levels[1],
-                                    list(self.index.levels[1]) + ['three'])
+
+        exp0 = Index(list(self.index.levels[0]) + ['abc'], name='first')
+        tm.assert_index_equal(new_index.levels[0], exp0)
+
+        exp1 = Index(list(self.index.levels[1]) + ['three'], name='second')
+        tm.assert_index_equal(new_index.levels[1], exp1)
         self.assertEqual(new_index[0], ('abc', 'three'))
 
         # key wrong length
-        assertRaisesRegexp(ValueError, "Item must have length equal to number"
-                           " of levels", self.index.insert, 0, ('foo2', ))
+        msg = "Item must have length equal to number of levels"
+        with assertRaisesRegexp(ValueError, msg):
+            self.index.insert(0, ('foo2', ))
 
         left = pd.DataFrame([['a', 'b', 0], ['b', 'd', 1]],
                             columns=['1st', '2nd', '3rd'])
@@ -1545,14 +1557,9 @@ def test_insert(self):
         ts.loc[('a', 'w')] = 5
         ts.loc['a', 'a'] = 6
 
-        right = pd.DataFrame([['a', 'b', 0],
-                              ['b', 'd', 1],
-                              ['b', 'x', 2],
-                              ['b', 'a', -1],
-                              ['b', 'b', 3],
-                              ['a', 'x', 4],
-                              ['a', 'w', 5],
-                              ['a', 'a', 6]],
+        right = pd.DataFrame([['a', 'b', 0], ['b', 'd', 1], ['b', 'x', 2],
+                              ['b', 'a', -1], ['b', 'b', 3], ['a', 'x', 4],
+                              ['a', 'w', 5], ['a', 'a', 6]],
                              columns=['1st', '2nd', '3rd'])
         right.set_index(['1st', '2nd'], inplace=True)
         # FIXME data types changes to float because
@@ -1993,9 +2000,9 @@ def test_isin(self):
     def test_isin_nan(self):
         idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
         tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
-                                    [False, False])
+                                    np.array([False, False]))
         tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
-                                    [False, False])
+                                    np.array([False, False]))
 
     def test_isin_level_kwarg(self):
         idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange(
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 8592ae1741a4e..5eac0bc870756 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -158,6 +158,7 @@ def check_is_index(self, i):
 
     def check_coerce(self, a, b, is_float_index=True):
         self.assertTrue(a.equals(b))
+        self.assert_index_equal(a, b, exact=False)
         if is_float_index:
             self.assertIsInstance(b, Float64Index)
         else:
@@ -259,6 +260,11 @@ def test_astype(self):
         for dtype in ['M8[ns]', 'm8[ns]']:
             self.assertRaises(TypeError, lambda: i.astype(dtype))
 
+        # GH 13149
+        for dtype in ['int16', 'int32', 'int64']:
+            i = Float64Index([0, 1.1, np.NAN])
+            self.assertRaises(ValueError, lambda: i.astype(dtype))
+
     def test_equals(self):
 
         i = Float64Index([1.0, 2.0])
@@ -277,14 +283,16 @@ def test_equals(self):
 
     def test_get_indexer(self):
         idx = Float64Index([0.0, 1.0, 2.0])
-        tm.assert_numpy_array_equal(idx.get_indexer(idx), [0, 1, 2])
+        tm.assert_numpy_array_equal(idx.get_indexer(idx),
+                                    np.array([0, 1, 2], dtype=np.int_))
 
         target = [-0.1, 0.5, 1.1]
-        tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), [-1, 0, 1])
-        tm.assert_numpy_array_equal(
-            idx.get_indexer(target, 'backfill'), [0, 1, 2])
-        tm.assert_numpy_array_equal(
-            idx.get_indexer(target, 'nearest'), [0, 1, 1])
+        tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'),
+                                    np.array([-1, 0, 1], dtype=np.int_))
+        tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'),
+                                    np.array([0, 1, 2], dtype=np.int_))
+        tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'),
+                                    np.array([0, 1, 1], dtype=np.int_))
 
     def test_get_loc(self):
         idx = Float64Index([0.0, 1.0, 2.0])
@@ -353,7 +361,7 @@ def test_astype_from_object(self):
         index = Index([1.0, np.nan, 0.2], dtype='object')
         result = index.astype(float)
         expected = Float64Index([1.0, np.nan, 0.2])
-        tm.assert_equal(result.dtype, expected.dtype)
+        self.assertEqual(result.dtype, expected.dtype)
         tm.assert_index_equal(result, expected)
 
     def test_fillna_float64(self):
@@ -420,12 +428,12 @@ def testit():
     def test_constructor(self):
         # pass list, coerce fine
         index = Int64Index([-5, 0, 1, 2])
-        expected = np.array([-5, 0, 1, 2], dtype=np.int64)
-        tm.assert_numpy_array_equal(index, expected)
+        expected = Index([-5, 0, 1, 2], dtype=np.int64)
+        tm.assert_index_equal(index, expected)
 
         # from iterable
         index = Int64Index(iter([-5, 0, 1, 2]))
-        tm.assert_numpy_array_equal(index, expected)
+        tm.assert_index_equal(index, expected)
 
         # scalar raise Exception
         self.assertRaises(TypeError, Int64Index, 5)
@@ -433,7 +441,7 @@ def test_constructor(self):
         # copy
         arr = self.index.values
         new_index = Int64Index(arr, copy=True)
-        tm.assert_numpy_array_equal(new_index, self.index)
+        tm.assert_index_equal(new_index, self.index)
         val = arr[0] + 3000
 
         # this should not change index
@@ -452,7 +460,7 @@ def test_constructor_corner(self):
         arr = np.array([1, 2, 3, 4], dtype=object)
         index = Int64Index(arr)
         self.assertEqual(index.values.dtype, np.int64)
-        self.assertTrue(index.equals(arr))
+        self.assert_index_equal(index, Index(arr))
 
         # preventing casting
         arr = np.array([1, '2', 3, '4'], dtype=object)
@@ -576,7 +584,7 @@ def test_join_outer(self):
         res, lidx, ridx = self.index.join(other, how='outer',
                                           return_indexers=True)
         noidx_res = self.index.join(other, how='outer')
-        self.assertTrue(res.equals(noidx_res))
+        self.assert_index_equal(res, noidx_res)
 
         eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25])
         elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1],
                          dtype=np.int_)
                          dtype=np.int_)
 
         tm.assertIsInstance(res, Int64Index)
-        self.assertTrue(res.equals(eres))
+        self.assert_index_equal(res, eres)
         tm.assert_numpy_array_equal(lidx, elidx)
         tm.assert_numpy_array_equal(ridx, eridx)
 
@@ -593,14 +601,14 @@ def test_join_outer(self):
         res, lidx, ridx = self.index.join(other_mono, how='outer',
                                           return_indexers=True)
         noidx_res = self.index.join(other_mono, how='outer')
-        self.assertTrue(res.equals(noidx_res))
+        self.assert_index_equal(res, noidx_res)
 
         elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1],
                          dtype=np.int64)
         eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5],
                          dtype=np.int64)
 
         tm.assertIsInstance(res, Int64Index)
-        self.assertTrue(res.equals(eres))
+        self.assert_index_equal(res, eres)
         tm.assert_numpy_array_equal(lidx, elidx)
         tm.assert_numpy_array_equal(ridx, eridx)
 
@@ -623,7 +631,7 @@ def test_join_inner(self):
         eridx = np.array([4, 1], dtype=np.int_)
 
         tm.assertIsInstance(res, Int64Index)
-        self.assertTrue(res.equals(eres))
+        self.assert_index_equal(res, eres)
tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -632,12 +640,12 @@ def test_join_inner(self): return_indexers=True) res2 = self.index.intersection(other_mono) - self.assertTrue(res.equals(res2)) + self.assert_index_equal(res, res2) elidx = np.array([1, 6], dtype=np.int64) eridx = np.array([1, 4], dtype=np.int64) tm.assertIsInstance(res, Int64Index) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -653,7 +661,7 @@ def test_join_left(self): dtype=np.int_) tm.assertIsInstance(res, Int64Index) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assertIsNone(lidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -663,7 +671,7 @@ def test_join_left(self): eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], dtype=np.int64) tm.assertIsInstance(res, Int64Index) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assertIsNone(lidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -674,7 +682,7 @@ def test_join_left(self): eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -689,7 +697,7 @@ def test_join_right(self): elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.int_) tm.assertIsInstance(other, Int64Index) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) self.assertIsNone(ridx) @@ -699,7 +707,7 @@ def test_join_right(self): eres = other_mono elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.int64) tm.assertIsInstance(other, Int64Index) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) self.assertIsNone(ridx) @@ -710,7 +718,7 @@ def test_join_right(self): eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -719,28 +727,27 @@ def test_join_non_int_index(self): outer = self.index.join(other, how='outer') outer2 = other.join(self.index, how='outer') - expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, - 16, 18], dtype=object) - self.assertTrue(outer.equals(outer2)) - self.assertTrue(outer.equals(expected)) + expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) + self.assert_index_equal(outer, outer2) + self.assert_index_equal(outer, expected) inner = self.index.join(other, how='inner') inner2 = other.join(self.index, how='inner') - expected = Index([6, 8, 10], dtype=object) - self.assertTrue(inner.equals(inner2)) - self.assertTrue(inner.equals(expected)) + expected = Index([6, 8, 10]) + self.assert_index_equal(inner, inner2) + self.assert_index_equal(inner, expected) left = self.index.join(other, how='left') - self.assertTrue(left.equals(self.index)) + self.assert_index_equal(left, self.index.astype(object)) left2 = other.join(self.index, how='left') - self.assertTrue(left2.equals(other)) + self.assert_index_equal(left2, other) right = self.index.join(other, how='right') - self.assertTrue(right.equals(other)) + 
self.assert_index_equal(right, other) right2 = other.join(self.index, how='right') - self.assertTrue(right2.equals(self.index)) + self.assert_index_equal(right2, self.index.astype(object)) def test_join_non_unique(self): left = Index([4, 4, 3, 3]) @@ -748,7 +755,7 @@ def test_join_non_unique(self): joined, lidx, ridx = left.join(left, return_indexers=True) exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) - self.assertTrue(joined.equals(exp_joined)) + self.assert_index_equal(joined, exp_joined) exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.int_) tm.assert_numpy_array_equal(lidx, exp_lidx) @@ -765,13 +772,14 @@ def test_join_self(self): def test_intersection(self): other = Index([1, 2, 3, 4, 5]) result = self.index.intersection(other) - expected = np.sort(np.intersect1d(self.index.values, other.values)) - tm.assert_numpy_array_equal(result, expected) + expected = Index(np.sort(np.intersect1d(self.index.values, + other.values))) + tm.assert_index_equal(result, expected) result = other.intersection(self.index) - expected = np.sort(np.asarray(np.intersect1d(self.index.values, - other.values))) - tm.assert_numpy_array_equal(result, expected) + expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, + other.values)))) + tm.assert_index_equal(result, expected) def test_intersect_str_dates(self): dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] @@ -788,12 +796,12 @@ def test_union_noncomparable(self): now = datetime.now() other = Index([now + timedelta(i) for i in range(4)], dtype=object) result = self.index.union(other) - expected = np.concatenate((self.index, other)) - tm.assert_numpy_array_equal(result, expected) + expected = Index(np.concatenate((self.index, other))) + tm.assert_index_equal(result, expected) result = other.union(self.index) - expected = np.concatenate((other, self.index)) - tm.assert_numpy_array_equal(result, expected) + expected = Index(np.concatenate((other, self.index))) + tm.assert_index_equal(result, expected) def test_cant_or_shouldnt_cast(self): # can't diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 8b04b510146d2..99e4b72bcee37 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -102,10 +102,10 @@ def test_constructor_same(self): self.assertTrue(result.identical(index)) result = RangeIndex(index, copy=True) - self.assertTrue(result.equals(index)) + self.assert_index_equal(result, index, exact=True) result = RangeIndex(index) - self.assertTrue(result.equals(index)) + self.assert_index_equal(result, index, exact=True) self.assertRaises(TypeError, lambda: RangeIndex(index, dtype='float64')) @@ -116,24 +116,24 @@ def test_constructor_range(self): result = RangeIndex.from_range(range(1, 5, 2)) expected = RangeIndex(1, 5, 2) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected, exact=True) result = RangeIndex.from_range(range(5, 6)) expected = RangeIndex(5, 6, 1) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected, exact=True) # an invalid range result = RangeIndex.from_range(range(5, 1)) expected = RangeIndex(0, 0, 1) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected, exact=True) result = RangeIndex.from_range(range(5)) expected = RangeIndex(0, 5, 1) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected, exact=True) result = Index(range(1, 5, 2)) expected = RangeIndex(1, 5, 2) - self.assertTrue(result.equals(expected)) + 
self.assert_index_equal(result, expected, exact=True) self.assertRaises(TypeError, lambda: Index(range(1, 5, 2), dtype='float64')) @@ -165,27 +165,28 @@ def test_numeric_compat2(self): result = idx * 2 expected = RangeIndex(0, 20, 4) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected, exact=True) result = idx + 2 expected = RangeIndex(2, 12, 2) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected, exact=True) result = idx - 2 expected = RangeIndex(-2, 8, 2) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected, exact=True) # truediv under PY3 result = idx / 2 + if PY3: - expected = RangeIndex(0, 5, 1) - else: expected = RangeIndex(0, 5, 1).astype('float64') - self.assertTrue(result.equals(expected)) + else: + expected = RangeIndex(0, 5, 1) + self.assert_index_equal(result, expected, exact=True) result = idx / 4 - expected = RangeIndex(0, 10, 2).values / 4 - self.assertTrue(result.equals(expected)) + expected = RangeIndex(0, 10, 2) / 4 + self.assert_index_equal(result, expected, exact=True) result = idx // 1 expected = idx @@ -220,7 +221,7 @@ def test_constructor_corner(self): arr = np.array([1, 2, 3, 4], dtype=object) index = RangeIndex(1, 5) self.assertEqual(index.values.dtype, np.int64) - self.assertTrue(index.equals(arr)) + self.assert_index_equal(index, Index(arr)) # non-int raise Exception self.assertRaises(TypeError, RangeIndex, '1', '10', '1') @@ -249,7 +250,7 @@ def test_repr(self): self.assertTrue(result, expected) result = eval(result) - self.assertTrue(result.equals(i)) + self.assert_index_equal(result, i, exact=True) i = RangeIndex(5, 0, -1) result = repr(i) @@ -257,7 +258,7 @@ def test_repr(self): self.assertEqual(result, expected) result = eval(result) - self.assertTrue(result.equals(i)) + self.assert_index_equal(result, i, exact=True) def test_insert(self): @@ -265,19 +266,19 @@ def test_insert(self): result = idx[1:4] # test 0th element - self.assertTrue(idx[0:4].equals(result.insert(0, idx[0]))) + self.assert_index_equal(idx[0:4], result.insert(0, idx[0])) def test_delete(self): idx = RangeIndex(5, name='Foo') expected = idx[1:].astype(int) result = idx.delete(0) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) expected = idx[:-1].astype(int) result = idx.delete(-1) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) with tm.assertRaises((IndexError, ValueError)): @@ -292,7 +293,7 @@ def test_view(self): self.assertEqual(i_view.name, 'Foo') i_view = i.view('i8') - tm.assert_numpy_array_equal(i, i_view) + tm.assert_numpy_array_equal(i.values, i_view) i_view = i.view(RangeIndex) tm.assert_index_equal(i, i_view) @@ -376,7 +377,7 @@ def test_join_outer(self): res, lidx, ridx = self.index.join(other, how='outer', return_indexers=True) noidx_res = self.index.join(other, how='outer') - self.assertTrue(res.equals(noidx_res)) + self.assert_index_equal(res, noidx_res) eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]) @@ -387,7 +388,7 @@ def test_join_outer(self): self.assertIsInstance(res, Int64Index) self.assertFalse(isinstance(res, RangeIndex)) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assert_numpy_array_equal(lidx, elidx) self.assert_numpy_array_equal(ridx, eridx) @@ -397,11 +398,11 @@ def test_join_outer(self): res, lidx, ridx = 
self.index.join(other, how='outer', return_indexers=True) noidx_res = self.index.join(other, how='outer') - self.assertTrue(res.equals(noidx_res)) + self.assert_index_equal(res, noidx_res) self.assertIsInstance(res, Int64Index) self.assertFalse(isinstance(res, RangeIndex)) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assert_numpy_array_equal(lidx, elidx) self.assert_numpy_array_equal(ridx, eridx) @@ -423,7 +424,7 @@ def test_join_inner(self): eridx = np.array([9, 7]) self.assertIsInstance(res, Int64Index) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assert_numpy_array_equal(lidx, elidx) self.assert_numpy_array_equal(ridx, eridx) @@ -434,7 +435,7 @@ def test_join_inner(self): return_indexers=True) self.assertIsInstance(res, RangeIndex) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assert_numpy_array_equal(lidx, elidx) self.assert_numpy_array_equal(ridx, eridx) @@ -448,7 +449,7 @@ def test_join_left(self): eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.int_) self.assertIsInstance(res, RangeIndex) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, eridx) @@ -459,7 +460,7 @@ def test_join_left(self): return_indexers=True) self.assertIsInstance(res, RangeIndex) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, eridx) @@ -474,7 +475,7 @@ def test_join_right(self): dtype=np.int_) self.assertIsInstance(other, Int64Index) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assert_numpy_array_equal(lidx, elidx) self.assertIsNone(ridx) @@ -486,7 +487,7 @@ def test_join_right(self): eres = other self.assertIsInstance(other, RangeIndex) - self.assertTrue(res.equals(eres)) + self.assert_index_equal(res, eres) self.assert_numpy_array_equal(lidx, elidx) self.assertIsNone(ridx) @@ -495,28 +496,27 @@ def test_join_non_int_index(self): outer = self.index.join(other, how='outer') outer2 = other.join(self.index, how='outer') - expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, - 16, 18], dtype=object) - self.assertTrue(outer.equals(outer2)) - self.assertTrue(outer.equals(expected)) + expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) + self.assert_index_equal(outer, outer2) + self.assert_index_equal(outer, expected) inner = self.index.join(other, how='inner') inner2 = other.join(self.index, how='inner') - expected = Index([6, 8, 10], dtype=object) - self.assertTrue(inner.equals(inner2)) - self.assertTrue(inner.equals(expected)) + expected = Index([6, 8, 10]) + self.assert_index_equal(inner, inner2) + self.assert_index_equal(inner, expected) left = self.index.join(other, how='left') - self.assertTrue(left.equals(self.index)) + self.assert_index_equal(left, self.index.astype(object)) left2 = other.join(self.index, how='left') - self.assertTrue(left2.equals(other)) + self.assert_index_equal(left2, other) right = self.index.join(other, how='right') - self.assertTrue(right.equals(other)) + self.assert_index_equal(right, other) right2 = other.join(self.index, how='right') - self.assertTrue(right2.equals(self.index)) + self.assert_index_equal(right2, self.index.astype(object)) def test_join_non_unique(self): other = Index([4, 4, 3, 3]) @@ -528,7 +528,7 @@ def test_join_non_unique(self): eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], dtype=np.int_) - self.assertTrue(res.equals(eres)) + 
self.assert_index_equal(res, eres) self.assert_numpy_array_equal(lidx, elidx) self.assert_numpy_array_equal(ridx, eridx) @@ -542,25 +542,28 @@ def test_intersection(self): # intersect with Int64Index other = Index(np.arange(1, 6)) result = self.index.intersection(other) - expected = np.sort(np.intersect1d(self.index.values, other.values)) - self.assert_numpy_array_equal(result, expected) + expected = Index(np.sort(np.intersect1d(self.index.values, + other.values))) + self.assert_index_equal(result, expected) result = other.intersection(self.index) - expected = np.sort(np.asarray(np.intersect1d(self.index.values, - other.values))) - self.assert_numpy_array_equal(result, expected) + expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, + other.values)))) + self.assert_index_equal(result, expected) # intersect with increasing RangeIndex other = RangeIndex(1, 6) result = self.index.intersection(other) - expected = np.sort(np.intersect1d(self.index.values, other.values)) - self.assert_numpy_array_equal(result, expected) + expected = Index(np.sort(np.intersect1d(self.index.values, + other.values))) + self.assert_index_equal(result, expected) # intersect with decreasing RangeIndex other = RangeIndex(5, 0, -1) result = self.index.intersection(other) - expected = np.sort(np.intersect1d(self.index.values, other.values)) - self.assert_numpy_array_equal(result, expected) + expected = Index(np.sort(np.intersect1d(self.index.values, + other.values))) + self.assert_index_equal(result, expected) def test_intersect_str_dates(self): dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] @@ -577,12 +580,12 @@ def test_union_noncomparable(self): now = datetime.now() other = Index([now + timedelta(i) for i in range(4)], dtype=object) result = self.index.union(other) - expected = np.concatenate((self.index, other)) - self.assert_numpy_array_equal(result, expected) + expected = Index(np.concatenate((self.index, other))) + self.assert_index_equal(result, expected) result = other.union(self.index) - expected = np.concatenate((other, self.index)) - self.assert_numpy_array_equal(result, expected) + expected = Index(np.concatenate((other, self.index))) + self.assert_index_equal(result, expected) def test_union(self): RI = RangeIndex @@ -789,43 +792,43 @@ def test_slice_specialised(self): # slice value completion index = self.index[:] expected = self.index - self.assert_numpy_array_equal(index, expected) + self.assert_index_equal(index, expected) # positive slice values index = self.index[7:10:2] - expected = np.array([14, 18]) - self.assert_numpy_array_equal(index, expected) + expected = Index(np.array([14, 18]), name='foo') + self.assert_index_equal(index, expected) # negative slice values index = self.index[-1:-5:-2] - expected = np.array([18, 14]) - self.assert_numpy_array_equal(index, expected) + expected = Index(np.array([18, 14]), name='foo') + self.assert_index_equal(index, expected) # stop overshoot index = self.index[2:100:4] - expected = np.array([4, 12]) - self.assert_numpy_array_equal(index, expected) + expected = Index(np.array([4, 12]), name='foo') + self.assert_index_equal(index, expected) # reverse index = self.index[::-1] - expected = self.index.values[::-1] - self.assert_numpy_array_equal(index, expected) + expected = Index(self.index.values[::-1], name='foo') + self.assert_index_equal(index, expected) index = self.index[-8::-1] - expected = np.array([4, 2, 0]) - self.assert_numpy_array_equal(index, expected) + expected = Index(np.array([4, 2, 0]), name='foo') + 
self.assert_index_equal(index, expected) index = self.index[-40::-1] - expected = np.array([]) - self.assert_numpy_array_equal(index, expected) + expected = Index(np.array([], dtype=np.int64), name='foo') + self.assert_index_equal(index, expected) index = self.index[40::-1] - expected = self.index.values[40::-1] - self.assert_numpy_array_equal(index, expected) + expected = Index(self.index.values[40::-1], name='foo') + self.assert_index_equal(index, expected) index = self.index[10::-1] - expected = self.index.values[::-1] - self.assert_numpy_array_equal(index, expected) + expected = Index(self.index.values[::-1], name='foo') + self.assert_index_equal(index, expected) def test_len_specialised(self): diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 53ab9aca03f6c..2cb62a60f885b 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -108,15 +108,17 @@ def test_loc_listlike_dtypes(self): # unique slice res = df.loc[['a', 'b']] - exp = DataFrame({'A': [1, 2], - 'B': [4, 5]}, index=pd.CategoricalIndex(['a', 'b'])) + exp_index = pd.CategoricalIndex(['a', 'b'], + categories=index.categories) + exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] - exp = DataFrame({'A': [1, 1, 2], - 'B': [4, 4, 5]}, - index=pd.CategoricalIndex(['a', 'a', 'b'])) + + exp_index = pd.CategoricalIndex(['a', 'a', 'b'], + categories=index.categories) + exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) with tm.assertRaisesRegexp( @@ -194,12 +196,15 @@ def test_ix_categorical_index(self): expect = pd.Series(df.ix[:, 'X'], index=cdf.index, name='X') assert_series_equal(cdf.ix[:, 'X'], expect) + exp_index = pd.CategoricalIndex(list('AB'), categories=['A', 'B', 'C']) expect = pd.DataFrame(df.ix[['A', 'B'], :], columns=cdf.columns, - index=pd.CategoricalIndex(list('AB'))) + index=exp_index) assert_frame_equal(cdf.ix[['A', 'B'], :], expect) + exp_columns = pd.CategoricalIndex(list('XY'), + categories=['X', 'Y', 'Z']) expect = pd.DataFrame(df.ix[:, ['X', 'Y']], index=cdf.index, - columns=pd.CategoricalIndex(list('XY'))) + columns=exp_columns) assert_frame_equal(cdf.ix[:, ['X', 'Y']], expect) # non-unique @@ -209,12 +214,14 @@ def test_ix_categorical_index(self): cdf.index = pd.CategoricalIndex(df.index) cdf.columns = pd.CategoricalIndex(df.columns) + exp_index = pd.CategoricalIndex(list('AA'), categories=['A', 'B']) expect = pd.DataFrame(df.ix['A', :], columns=cdf.columns, - index=pd.CategoricalIndex(list('AA'))) + index=exp_index) assert_frame_equal(cdf.ix['A', :], expect) + exp_columns = pd.CategoricalIndex(list('XX'), categories=['X', 'Y']) expect = pd.DataFrame(df.ix[:, 'X'], index=cdf.index, - columns=pd.CategoricalIndex(list('XX'))) + columns=exp_columns) assert_frame_equal(cdf.ix[:, 'X'], expect) expect = pd.DataFrame(df.ix[['A', 'B'], :], columns=cdf.columns, diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 2a2f8678694de..29f3889d20bd0 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -538,8 +538,10 @@ def test_slice_float(self): # getitem result = idxr(s)[l] - self.assertTrue(result.equals(expected)) - + if isinstance(s, Series): + self.assert_series_equal(result, expected) + else: + self.assert_frame_equal(result, expected) # setitem s2 = 
s.copy() idxr(s2)[l] = 0 diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 4b8b5ae2571d0..b86b248ead290 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -20,14 +20,14 @@ MultiIndex, Timestamp, Timedelta) from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, - assert_attr_equal) + assert_attr_equal, slow) from pandas.formats.printing import pprint_thing from pandas import concat, lib from pandas.core.common import PerformanceWarning import pandas.util.testing as tm from pandas import date_range -from numpy.testing.decorators import slow + _verbose = False @@ -2334,6 +2334,18 @@ def test_multiindex_slicers_non_unique(self): self.assertFalse(result.index.is_unique) assert_frame_equal(result, expected) + # GH12896 + # numpy-implementation dependent bug + ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, + 17, 18, 19, 200000, 200000] + n = len(ints) + idx = MultiIndex.from_arrays([['a'] * n, ints]) + result = Series([1] * n, index=idx) + result = result.sort_index() + result = result.loc[(slice(None), slice(100000))] + expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() + assert_series_equal(result, expected) + def test_multiindex_slicers_datetimelike(self): # GH 7429 @@ -2913,7 +2925,7 @@ def test_dups_fancy_indexing(self): df.columns = ['a', 'a', 'b'] result = df[['b', 'a']].columns expected = Index(['b', 'a', 'a']) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], @@ -3817,7 +3829,7 @@ def test_astype_assignment_with_dups(self): index = df.index.copy() df['A'] = df['A'].astype(np.float64) - self.assertTrue(df.index.equals(index)) + self.assert_index_equal(df.index, index) # TODO(wesm): unused variables # result = df.get_dtype_counts().sort_index() @@ -4238,7 +4250,8 @@ def test_series_partial_set_period(self): pd.Period('2011-01-03', freq='D')] exp = Series([np.nan, 0.2, np.nan], index=pd.PeriodIndex(keys, name='idx'), name='s') - assert_series_equal(ser.loc[keys], exp, check_index_type=True) + result = ser.loc[keys] + assert_series_equal(result, exp) def test_partial_set_invalid(self): diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 574dcd54933ae..2ddfa27eea377 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -48,7 +48,7 @@ def test_rename(self): # partial dict s = Series(np.arange(4), index=['a', 'b', 'c', 'd'], dtype='int64') renamed = s.rename({'b': 'foo', 'd': 'bar'}) - self.assert_numpy_array_equal(renamed.index, ['a', 'foo', 'c', 'bar']) + self.assert_index_equal(renamed.index, Index(['a', 'foo', 'c', 'bar'])) # index with name renamer = Series(np.arange(4), @@ -141,7 +141,7 @@ def test_reset_index(self): self.assertEqual(len(rs.columns), 2) rs = s.reset_index(level=[0, 2], drop=True) - self.assertTrue(rs.index.equals(Index(index.get_level_values(1)))) + self.assert_index_equal(rs.index, Index(index.get_level_values(1))) tm.assertIsInstance(rs, Series) def test_reset_index_range(self): diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 002b7fa3aa8df..433f0f4bc67f5 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -289,8 +289,8 @@ def test_argsort_stable(self): mexpected = np.argsort(s.values, 
kind='mergesort') qexpected = np.argsort(s.values, kind='quicksort') - self.assert_numpy_array_equal(mindexer, mexpected) - self.assert_numpy_array_equal(qindexer, qexpected) + self.assert_series_equal(mindexer, Series(mexpected)) + self.assert_series_equal(qindexer, Series(qexpected)) self.assertFalse(np.array_equal(qindexer, mindexer)) def test_cumsum(self): @@ -300,24 +300,24 @@ def test_cumprod(self): self._check_accum_op('cumprod') def test_cummin(self): - self.assert_numpy_array_equal(self.ts.cummin(), + self.assert_numpy_array_equal(self.ts.cummin().values, np.minimum.accumulate(np.array(self.ts))) ts = self.ts.copy() ts[::2] = np.NaN result = ts.cummin()[1::2] expected = np.minimum.accumulate(ts.valid()) - self.assert_numpy_array_equal(result, expected) + self.assert_series_equal(result, expected) def test_cummax(self): - self.assert_numpy_array_equal(self.ts.cummax(), + self.assert_numpy_array_equal(self.ts.cummax().values, np.maximum.accumulate(np.array(self.ts))) ts = self.ts.copy() ts[::2] = np.NaN result = ts.cummax()[1::2] expected = np.maximum.accumulate(ts.valid()) - self.assert_numpy_array_equal(result, expected) + self.assert_series_equal(result, expected) def test_cummin_datetime64(self): s = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', '2000-1-1', @@ -489,7 +489,8 @@ def testit(): def _check_accum_op(self, name): func = getattr(np, name) - self.assert_numpy_array_equal(func(self.ts), func(np.array(self.ts))) + self.assert_numpy_array_equal(func(self.ts).values, + func(np.array(self.ts))) # with missing values ts = self.ts.copy() @@ -498,7 +499,7 @@ def _check_accum_op(self, name): result = func(ts)[1::2] expected = func(np.array(ts.valid())) - self.assert_numpy_array_equal(result, expected) + self.assert_numpy_array_equal(result.values, expected) def test_compress(self): cond = [True, False, True, False, False] @@ -1279,6 +1280,7 @@ def test_idxmax(self): self.assertEqual(result, 1.1) def test_numpy_argmax(self): + # argmax is aliased to idxmax data = np.random.randint(0, 11, size=10) result = np.argmax(Series(data)) @@ -1355,7 +1357,7 @@ def test_searchsorted_numeric_dtypes_scalar(self): s = Series([1, 2, 90, 1000, 3e9]) r = s.searchsorted(30) e = 2 - tm.assert_equal(r, e) + self.assertEqual(r, e) r = s.searchsorted([30]) e = np.array([2], dtype=np.int64) @@ -1372,7 +1374,7 @@ def test_search_sorted_datetime64_scalar(self): v = pd.Timestamp('20120102') r = s.searchsorted(v) e = 1 - tm.assert_equal(r, e) + self.assertEqual(r, e) def test_search_sorted_datetime64_list(self): s = Series(pd.date_range('20120101', periods=10, freq='2D')) @@ -1395,6 +1397,23 @@ def test_is_unique(self): s = Series(np.arange(1000)) self.assertTrue(s.is_unique) + def test_is_monotonic(self): + + s = Series(np.random.randint(0, 10, size=1000)) + self.assertFalse(s.is_monotonic) + s = Series(np.arange(1000)) + self.assertTrue(s.is_monotonic) + self.assertTrue(s.is_monotonic_increasing) + s = Series(np.arange(1000, 0, -1)) + self.assertTrue(s.is_monotonic_decreasing) + + s = Series(pd.date_range('20130101', periods=10)) + self.assertTrue(s.is_monotonic) + self.assertTrue(s.is_monotonic_increasing) + s = Series(list(reversed(s.tolist()))) + self.assertFalse(s.is_monotonic) + self.assertTrue(s.is_monotonic_decreasing) + def test_sort_values(self): ts = self.ts.copy() @@ -1403,13 +1422,13 @@ def test_sort_values(self): with tm.assert_produces_warning(FutureWarning): ts.sort() - self.assert_numpy_array_equal(ts, self.ts.sort_values()) - self.assert_numpy_array_equal(ts.index, 
self.ts.sort_values().index) + self.assert_series_equal(ts, self.ts.sort_values()) + self.assert_index_equal(ts.index, self.ts.sort_values().index) ts.sort_values(ascending=False, inplace=True) - self.assert_numpy_array_equal(ts, self.ts.sort_values(ascending=False)) - self.assert_numpy_array_equal(ts.index, self.ts.sort_values( - ascending=False).index) + self.assert_series_equal(ts, self.ts.sort_values(ascending=False)) + self.assert_index_equal(ts.index, + self.ts.sort_values(ascending=False).index) # GH 5856/5853 # Series.sort_values operating on a view @@ -1512,11 +1531,11 @@ def test_order(self): result = ts.sort_values() self.assertTrue(np.isnan(result[-5:]).all()) - self.assert_numpy_array_equal(result[:-5], np.sort(vals[5:])) + self.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:])) result = ts.sort_values(na_position='first') self.assertTrue(np.isnan(result[:5]).all()) - self.assert_numpy_array_equal(result[5:], np.sort(vals[5:])) + self.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:])) # something object-type ser = Series(['A', 'B'], [1, 2]) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 6e0a0175b403f..26fc80c3ef988 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -160,7 +160,7 @@ def test_map(self): # function result = self.ts.map(lambda x: x * 2) - self.assert_numpy_array_equal(result, self.ts * 2) + self.assert_series_equal(result, self.ts * 2) # GH 10324 a = Series([1, 2, 3, 4]) @@ -187,7 +187,8 @@ def test_map(self): index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e'])) - exp = Series([np.nan, 'B', 'C', 'D'], dtype='category') + exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'], + categories=['B', 'C', 'D', 'E'])) self.assert_series_equal(a.map(b), exp) exp = Series([np.nan, 'B', 'C', 'D']) self.assert_series_equal(a.map(c), exp) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 72f1cac219998..eb560d4a17055 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -49,14 +49,14 @@ def test_combine_first(self): # nothing used from the input combined = series.combine_first(series_copy) - self.assert_numpy_array_equal(combined, series) + self.assert_series_equal(combined, series) # Holes filled from input combined = series_copy.combine_first(series) self.assertTrue(np.isfinite(combined).all()) - self.assert_numpy_array_equal(combined[::2], series[::2]) - self.assert_numpy_array_equal(combined[1::2], series_copy[1::2]) + self.assert_series_equal(combined[::2], series[::2]) + self.assert_series_equal(combined[1::2], series_copy[1::2]) # mixed types index = tm.makeStringIndex(20) @@ -65,8 +65,9 @@ def test_combine_first(self): combined = strings.combine_first(floats) - tm.assert_dict_equal(strings, combined, compare_keys=False) - tm.assert_dict_equal(floats[1::2], combined, compare_keys=False) + tm.assert_series_equal(strings, combined.loc[index[::2]]) + tm.assert_series_equal(floats[1::2].astype(object), + combined.loc[index[1::2]]) # corner case s = Series([1., 2, 3], index=[0, 1, 2]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 68733700e1483..a80a3af56b18f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -137,7 +137,7 @@ def test_constructor_categorical(self): cat = pd.Categorical([0, 1, 2, 
0, 1, 2], ['a', 'b', 'c'], fastpath=True) res = Series(cat) - self.assertTrue(res.values.equals(cat)) + tm.assert_categorical_equal(res.values, cat) # GH12574 self.assertRaises( @@ -418,8 +418,10 @@ def test_constructor_with_datetime_tz(self): result = s.values self.assertIsInstance(result, np.ndarray) self.assertTrue(result.dtype == 'datetime64[ns]') - self.assertTrue(dr.equals(pd.DatetimeIndex(result).tz_localize( - 'UTC').tz_convert(tz=s.dt.tz))) + + exp = pd.DatetimeIndex(result) + exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz) + self.assert_index_equal(dr, exp) # indexing result = s.iloc[0] diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 5b12baf6c6fc5..6e82f81f901a9 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -320,8 +320,6 @@ def test_strftime(self): expected = np.array(['2015/03/01', '2015/03/02', '2015/03/03', '2015/03/04', '2015/03/05'], dtype=np.object_) # dtype may be S10 or U10 depending on python version - print(result) - print(expected) self.assert_numpy_array_equal(result, expected, check_dtype=False) period_index = period_range('20150301', periods=5) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index fc963d4597246..5194a29bc8b42 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -55,7 +55,7 @@ def test_astype_cast_object_int(self): arr = Series(['1', '2', '3', '4'], dtype=object) result = arr.astype(int) - self.assert_numpy_array_equal(result, np.arange(1, 5)) + self.assert_series_equal(result, Series(np.arange(1, 5))) def test_astype_datetimes(self): import pandas.tslib as tslib @@ -133,6 +133,21 @@ def test_astype_unicode(self): reload(sys) # noqa sys.setdefaultencoding(former_encoding) + def test_astype_dict(self): + s = Series(range(0, 10, 2), name='abc') + + result = s.astype({'abc': str}) + expected = Series(['0', '2', '4', '6', '8'], name='abc') + assert_series_equal(result, expected) + + result = s.astype({'abc': 'float64'}) + expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64', + name='abc') + assert_series_equal(result, expected) + + self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str}) + self.assertRaises(KeyError, s.astype, {0: str}) + def test_complexx(self): # GH4819 # complex access for ndarray compat diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 5ed3fda7d0b8f..d01ac3e1aef42 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -246,7 +246,7 @@ def test_getitem_boolean(self): result = s[list(mask)] expected = s[mask] assert_series_equal(result, expected) - self.assert_numpy_array_equal(result.index, s.index[mask]) + self.assert_index_equal(result.index, s.index[mask]) def test_getitem_boolean_empty(self): s = Series([], dtype=np.int64) @@ -287,6 +287,16 @@ def test_getitem_generator(self): assert_series_equal(result, expected) assert_series_equal(result2, expected) + def test_type_promotion(self): + # GH12599 + s = pd.Series() + s["a"] = pd.Timestamp("2016-01-01") + s["b"] = 3.0 + s["c"] = "foo" + expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"], + index=["a", "b", "c"]) + assert_series_equal(s, expected) + def test_getitem_boolean_object(self): # using column from DataFrame diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index 93bd7f0eec7c5..e3a0e056f4da1 100644 --- 
a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -103,7 +103,8 @@ def test_convert_objects(self): with tm.assert_produces_warning(FutureWarning): result = s.convert_objects(convert_dates='coerce', convert_numeric=False) - assert_series_equal(result, s) + expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2) + assert_series_equal(result, expected) # preserve if non-object s = Series([1], dtype='float32') @@ -270,7 +271,7 @@ def test_convert(self): s = Series(['foo', 'bar', 1, 1.0], dtype='O') result = s._convert(datetime=True, coerce=True) - expected = Series([lib.NaT] * 4) + expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2) assert_series_equal(result, expected) # preserve if non-object diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 4fda1152abd96..f89501d39f014 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -130,7 +130,7 @@ def test_to_frame(self): assert_frame_equal(rs, xp) def test_to_dict(self): - self.assert_numpy_array_equal(Series(self.ts.to_dict()), self.ts) + self.assert_series_equal(Series(self.ts.to_dict(), name='ts'), self.ts) def test_timeseries_periodindex(self): # GH2891 diff --git a/pandas/tests/series/test_misc_api.py b/pandas/tests/series/test_misc_api.py index 9f5433782b062..d74966738909d 100644 --- a/pandas/tests/series/test_misc_api.py +++ b/pandas/tests/series/test_misc_api.py @@ -206,7 +206,7 @@ def test_keys(self): self.assertIs(getkeys(), self.ts.index) def test_values(self): - self.assert_numpy_array_equal(self.ts, self.ts.values) + self.assert_almost_equal(self.ts.values, self.ts, check_dtype=False) def test_iteritems(self): for idx, val in compat.iteritems(self.series): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index dec4f878d7d56..ed10f5b0a7af3 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -247,16 +247,18 @@ def test_isnull_for_inf(self): def test_fillna(self): ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) - self.assert_numpy_array_equal(ts, ts.fillna(method='ffill')) + self.assert_series_equal(ts, ts.fillna(method='ffill')) ts[2] = np.NaN - self.assert_numpy_array_equal(ts.fillna(method='ffill'), - [0., 1., 1., 3., 4.]) - self.assert_numpy_array_equal(ts.fillna(method='backfill'), - [0., 1., 3., 3., 4.]) + exp = Series([0., 1., 1., 3., 4.], index=ts.index) + self.assert_series_equal(ts.fillna(method='ffill'), exp) - self.assert_numpy_array_equal(ts.fillna(value=5), [0., 1., 5., 3., 4.]) + exp = Series([0., 1., 3., 3., 4.], index=ts.index) + self.assert_series_equal(ts.fillna(method='backfill'), exp) + + exp = Series([0., 1., 5., 3., 4.], index=ts.index) + self.assert_series_equal(ts.fillna(value=5), exp) self.assertRaises(ValueError, ts.fillna) self.assertRaises(ValueError, self.ts.fillna, value=0, method='ffill') @@ -433,8 +435,8 @@ def test_valid(self): result = ts.valid() self.assertEqual(len(result), ts.count()) - - tm.assert_dict_equal(result, ts, compare_keys=False) + tm.assert_series_equal(result, ts[1::2]) + tm.assert_series_equal(result, ts[pd.notnull(ts)]) def test_isnull(self): ser = Series([0, 5.4, 3, nan, -0.001]) @@ -488,7 +490,7 @@ def test_interpolate(self): ts_copy[5:10] = np.NaN linear_interp = ts_copy.interpolate(method='linear') - self.assert_numpy_array_equal(linear_interp, ts) + self.assert_series_equal(linear_interp, ts) ord_ts = Series([d.toordinal() for d in self.ts.index], index=self.ts.index).astype(float) @@ 
-497,7 +499,7 @@ def test_interpolate(self): ord_ts_copy[5:10] = np.NaN time_interp = ord_ts_copy.interpolate(method='time') - self.assert_numpy_array_equal(time_interp, ord_ts) + self.assert_series_equal(time_interp, ord_ts) # try time interpolation on a non-TimeSeries # Only raises ValueError if there are NaNs. diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index c5ef969d3b39d..1e23c87fdb4ca 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -264,6 +264,18 @@ def test_operators_timedelta64(self): rs[2] += np.timedelta64(timedelta(minutes=5, seconds=1)) self.assertEqual(rs[2], value) + def test_operator_series_comparison_zerorank(self): + # GH 13006 + result = np.float64(0) > pd.Series([1, 2, 3]) + expected = 0.0 > pd.Series([1, 2, 3]) + self.assert_series_equal(result, expected) + result = pd.Series([1, 2, 3]) < np.float64(0) + expected = pd.Series([1, 2, 3]) < 0.0 + self.assert_series_equal(result, expected) + result = np.array([0, 1, 2])[0] > pd.Series([0, 1, 2]) + expected = 0.0 > pd.Series([0, 1, 2]) + self.assert_series_equal(result, expected) + def test_timedeltas_with_DateOffset(self): # GH 4532 @@ -1227,8 +1239,9 @@ def test_operators_corner(self): # float + int int_ts = self.ts.astype(int)[:-5] added = self.ts + int_ts - expected = self.ts.values[:-5] + int_ts.values - self.assert_numpy_array_equal(added[:-5], expected) + expected = Series(self.ts.values[:-5] + int_ts.values, + index=self.ts.index[:-5], name='ts') + self.assert_series_equal(added[:-5], expected) def test_operators_reverse_object(self): # GH 56 diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index f538fa4e90401..e0bff7fbd39e4 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -126,6 +126,14 @@ def test_quantile_interpolation_np_lt_1p9(self): interpolation='higher') def test_quantile_nan(self): + + # GH 13098 + s = pd.Series([1, 2, 3, 4, np.nan]) + result = s.quantile(0.5) + expected = 2.5 + self.assertEqual(result, expected) + + # all nan/empty cases = [Series([]), Series([np.nan, np.nan])] for s in cases: diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index ee06bc2c3dd4e..13b95ea97eedf 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -25,7 +25,10 @@ def test_shift(self): shifted = self.ts.shift(1) unshifted = shifted.shift(-1) - tm.assert_dict_equal(unshifted.valid(), self.ts, compare_keys=False) + tm.assert_index_equal(shifted.index, self.ts.index) + tm.assert_index_equal(unshifted.index, self.ts.index) + tm.assert_numpy_array_equal(unshifted.valid().values, + self.ts.values[:-1]) offset = datetools.bday shifted = self.ts.shift(1, freq=offset) @@ -49,7 +52,9 @@ def test_shift(self): ps = tm.makePeriodSeries() shifted = ps.shift(1) unshifted = shifted.shift(-1) - tm.assert_dict_equal(unshifted.valid(), ps, compare_keys=False) + tm.assert_index_equal(shifted.index, ps.index) + tm.assert_index_equal(unshifted.index, ps.index) + tm.assert_numpy_array_equal(unshifted.valid().values, ps.values[:-1]) shifted2 = ps.shift(1, 'B') shifted3 = ps.shift(1, datetools.bday) @@ -77,16 +82,16 @@ def test_shift(self): # xref 8260 # with tz - s = Series( - date_range('2000-01-01 09:00:00', periods=5, - tz='US/Eastern'), name='foo') + s = Series(date_range('2000-01-01 09:00:00', periods=5, + tz='US/Eastern'), name='foo') result = s - s.shift() - 
assert_series_equal(result, Series( - TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo')) + + exp = Series(TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo') + assert_series_equal(result, exp) # incompat tz - s2 = Series( - date_range('2000-01-01 09:00:00', periods=5, tz='CET'), name='foo') + s2 = Series(date_range('2000-01-01 09:00:00', periods=5, + tz='CET'), name='foo') self.assertRaises(ValueError, lambda: s - s2) def test_tshift(self): @@ -346,8 +351,10 @@ def test_getitem_setitem_datetime_tz_dateutil(self): from pandas import date_range N = 50 + # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + rng = date_range('1/1/1990', periods=N, freq='H', + tz='America/New_York') ts = Series(np.random.randn(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -368,8 +375,8 @@ def test_getitem_setitem_datetime_tz_dateutil(self): assert_series_equal(result, ts) result = ts.copy() - result[datetime(1990, 1, 1, 3, tzinfo=tz('US/Central'))] = 0 - result[datetime(1990, 1, 1, 3, tzinfo=tz('US/Central'))] = ts[4] + result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = 0 + result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4] assert_series_equal(result, ts) def test_getitem_setitem_periodindex(self): @@ -485,15 +492,15 @@ def test_asfreq(self): daily_ts = ts.asfreq('B') monthly_ts = daily_ts.asfreq('BM') - self.assert_numpy_array_equal(monthly_ts, ts) + self.assert_series_equal(monthly_ts, ts) daily_ts = ts.asfreq('B', method='pad') monthly_ts = daily_ts.asfreq('BM') - self.assert_numpy_array_equal(monthly_ts, ts) + self.assert_series_equal(monthly_ts, ts) daily_ts = ts.asfreq(datetools.bday) monthly_ts = daily_ts.asfreq(datetools.bmonthEnd) - self.assert_numpy_array_equal(monthly_ts, ts) + self.assert_series_equal(monthly_ts, ts) result = ts[:0].asfreq('M') self.assertEqual(len(result), 0) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 917f108711d09..8af93ad0ecb2e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -3,15 +3,20 @@ import numpy as np from numpy.random import RandomState +from numpy import nan +import datetime -from pandas.core.api import Series, Categorical, CategoricalIndex +from pandas import Series, Categorical, CategoricalIndex, Index import pandas as pd from pandas import compat +import pandas.algos as _algos +from pandas.compat import lrange import pandas.core.algorithms as algos import pandas.util.testing as tm import pandas.hashtable as hashtable from pandas.compat.numpy import np_array_datetime64_compat +from pandas.util.testing import assert_almost_equal class TestMatch(tm.TestCase): @@ -102,14 +107,14 @@ def test_mixed(self): exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.int_) self.assert_numpy_array_equal(labels, exp) - exp = np.array(['A', 'B', 3.14, np.inf], dtype=object) - self.assert_numpy_array_equal(uniques, exp) + exp = pd.Index(['A', 'B', 3.14, np.inf]) + tm.assert_index_equal(uniques, exp) labels, uniques = algos.factorize(x, sort=True) exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.int_) self.assert_numpy_array_equal(labels, exp) - exp = np.array([3.14, np.inf, 'A', 'B'], dtype=object) - self.assert_numpy_array_equal(uniques, exp) + exp = pd.Index([3.14, np.inf, 'A', 'B']) + tm.assert_index_equal(uniques, exp) def test_datelike(self): @@ -121,14 +126,14 @@ def test_datelike(self): exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_) self.assert_numpy_array_equal(labels, exp) - exp = np.array([v1.value, v2.value], 
dtype='M8[ns]') - self.assert_numpy_array_equal(uniques, exp) + exp = pd.DatetimeIndex([v1, v2]) + self.assert_index_equal(uniques, exp) labels, uniques = algos.factorize(x, sort=True) exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.int_) self.assert_numpy_array_equal(labels, exp) - exp = np.array([v2.value, v1.value], dtype='M8[ns]') - self.assert_numpy_array_equal(uniques, exp) + exp = pd.DatetimeIndex([v2, v1]) + self.assert_index_equal(uniques, exp) # period v1 = pd.Period('201302', freq='M') @@ -139,12 +144,12 @@ def test_datelike(self): labels, uniques = algos.factorize(x) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_) self.assert_numpy_array_equal(labels, exp) - self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2])) + self.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) labels, uniques = algos.factorize(x, sort=True) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_) self.assert_numpy_array_equal(labels, exp) - self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2])) + self.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) # GH 5986 v1 = pd.to_timedelta('1 day 1 min') @@ -153,12 +158,12 @@ def test_datelike(self): labels, uniques = algos.factorize(x) exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.int_) self.assert_numpy_array_equal(labels, exp) - self.assert_numpy_array_equal(uniques, pd.to_timedelta([v1, v2])) + self.assert_index_equal(uniques, pd.to_timedelta([v1, v2])) labels, uniques = algos.factorize(x, sort=True) exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.int_) self.assert_numpy_array_equal(labels, exp) - self.assert_numpy_array_equal(uniques, pd.to_timedelta([v2, v1])) + self.assert_index_equal(uniques, pd.to_timedelta([v2, v1])) def test_factorize_nan(self): # nan should map to na_sentinel, not reverse_indexer[na_sentinel] @@ -580,7 +585,7 @@ def test_group_var_generic_1d(self): expected_counts = counts + 3 self.algo(out, counts, values, labels) - np.testing.assert_allclose(out, expected_out, self.rtol) + self.assertTrue(np.allclose(out, expected_out, self.rtol)) tm.assert_numpy_array_equal(counts, expected_counts) def test_group_var_generic_1d_flat_labels(self): @@ -596,7 +601,7 @@ def test_group_var_generic_1d_flat_labels(self): self.algo(out, counts, values, labels) - np.testing.assert_allclose(out, expected_out, self.rtol) + self.assertTrue(np.allclose(out, expected_out, self.rtol)) tm.assert_numpy_array_equal(counts, expected_counts) def test_group_var_generic_2d_all_finite(self): @@ -611,7 +616,7 @@ def test_group_var_generic_2d_all_finite(self): expected_counts = counts + 2 self.algo(out, counts, values, labels) - np.testing.assert_allclose(out, expected_out, self.rtol) + self.assertTrue(np.allclose(out, expected_out, self.rtol)) tm.assert_numpy_array_equal(counts, expected_counts) def test_group_var_generic_2d_some_nan(self): @@ -626,11 +631,11 @@ def test_group_var_generic_2d_some_nan(self): expected_out = np.vstack([values[:, 0] .reshape(5, 2, order='F') .std(ddof=1, axis=1) ** 2, - np.nan * np.ones(5)]).T + np.nan * np.ones(5)]).T.astype(self.dtype) expected_counts = counts + 2 self.algo(out, counts, values, labels) - np.testing.assert_allclose(out, expected_out, self.rtol) + tm.assert_almost_equal(out, expected_out, check_less_precise=6) tm.assert_numpy_array_equal(counts, expected_counts) def test_group_var_constant(self): @@ -705,6 +710,315 @@ def test_unique_label_indices(): tm.assert_numpy_array_equal(left, right) +def test_rank(): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + def _check(arr): + mask = 
~np.isfinite(arr) + arr = arr.copy() + result = _algos.rank_1d_float64(arr) + arr[mask] = np.inf + exp = rankdata(arr) + exp[mask] = nan + assert_almost_equal(result, exp) + + _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) + _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) + + +def test_pad_backfill_object_segfault(): + + old = np.array([], dtype='O') + new = np.array([datetime.datetime(2010, 12, 31)], dtype='O') + + result = _algos.pad_object(old, new) + expected = np.array([-1], dtype=np.int64) + assert (np.array_equal(result, expected)) + + result = _algos.pad_object(new, old) + expected = np.array([], dtype=np.int64) + assert (np.array_equal(result, expected)) + + result = _algos.backfill_object(old, new) + expected = np.array([-1], dtype=np.int64) + assert (np.array_equal(result, expected)) + + result = _algos.backfill_object(new, old) + expected = np.array([], dtype=np.int64) + assert (np.array_equal(result, expected)) + + +def test_arrmap(): + values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O') + result = _algos.arrmap_object(values, lambda x: x in ['foo', 'bar']) + assert (result.dtype == np.bool_) + + +class TestTseriesUtil(tm.TestCase): + _multiprocess_can_split_ = True + + def test_combineFunc(self): + pass + + def test_reindex(self): + pass + + def test_isnull(self): + pass + + def test_groupby(self): + pass + + def test_groupby_withnull(self): + pass + + def test_backfill(self): + old = Index([1, 5, 10]) + new = Index(lrange(12)) + + filler = _algos.backfill_int64(old.values, new.values) + + expect_filler = np.array([0, 0, 1, 1, 1, 1, + 2, 2, 2, 2, 2, -1], dtype=np.int64) + self.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = Index([1, 4]) + new = Index(lrange(5, 10)) + filler = _algos.backfill_int64(old.values, new.values) + + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) + self.assert_numpy_array_equal(filler, expect_filler) + + def test_pad(self): + old = Index([1, 5, 10]) + new = Index(lrange(12)) + + filler = _algos.pad_int64(old.values, new.values) + + expect_filler = np.array([-1, 0, 0, 0, 0, 1, + 1, 1, 1, 1, 2, 2], dtype=np.int64) + self.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = Index([5, 10]) + new = Index(lrange(5)) + filler = _algos.pad_int64(old.values, new.values) + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) + self.assert_numpy_array_equal(filler, expect_filler) + + +def test_left_join_indexer_unique(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([2, 2, 3, 4, 4], dtype=np.int64) + + result = _algos.left_join_indexer_unique_int64(b, a) + expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) + assert (np.array_equal(result, expected)) + + +def test_left_outer_join_bug(): + left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, + 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, + 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, + 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, + 3, 1, 2, 0, 2], dtype=np.int64) + + right = np.array([3, 1], dtype=np.int64) + max_groups = 4 + + lidx, ridx = _algos.left_outer_join(left, right, max_groups, sort=False) + + exp_lidx = np.arange(len(left)) + exp_ridx = -np.ones(len(left)) + exp_ridx[left == 1] = 1 + exp_ridx[left == 3] = 0 + + assert (np.array_equal(lidx, exp_lidx)) + assert (np.array_equal(ridx, exp_ridx)) + + +def test_inner_join_indexer(): + a = np.array([1, 2, 3, 
4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _algos.inner_join_indexer_int64(a, b) + + index_exp = np.array([3, 5], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([2, 4], dtype=np.int64) + bexp = np.array([1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _algos.inner_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_outer_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _algos.outer_join_indexer_int64(a, b) + + index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) + bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _algos.outer_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_left_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _algos.left_join_indexer_int64(a, b) + + assert_almost_equal(index, a) + + aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) + bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _algos.left_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_left_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _algos.left_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_outer_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _algos.outer_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_inner_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _algos.inner_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) + 
assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_is_lexsorted(): + failure = [ + np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, + 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0]), + np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, + 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, + 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, + 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, + 6, 5, + 4, 3, 2, 1, 0])] + + assert (not _algos.is_lexsorted(failure)) + +# def test_get_group_index(): +# a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64) +# b = np.array([1, 0, 3, 2, 0, 2, 3, 0], dtype=np.int64) +# expected = np.array([1, 4, 11, 2, 8, 6, 3, 0], dtype=np.int64) + +# result = lib.get_group_index([a, b], (3, 4)) + +# assert(np.array_equal(result, expected)) + + +def test_groupsort_indexer(): + a = np.random.randint(0, 1000, 100).astype(np.int64) + b = np.random.randint(0, 1000, 100).astype(np.int64) + + result = _algos.groupsort_indexer(a, 1000)[0] + + # need to use a stable sort + expected = np.argsort(a, kind='mergesort') + assert (np.array_equal(result, expected)) + + # compare with lexsort + key = a * 1000 + b + result = _algos.groupsort_indexer(key, 1000000)[0] + expected = np.lexsort((b, a)) + assert (np.array_equal(result, expected)) + + +def test_ensure_platform_int(): + arr = np.arange(100) + + result = _algos.ensure_platform_int(arr) + assert (result is arr) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 2fec7c591a2b7..77ae3ca20d123 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -18,7 +18,6 @@ from pandas.core.base import (FrozenList, FrozenNDArray, PandasDelegate, NoNewAttributesMixin) from pandas.tseries.base import DatetimeIndexOpsMixin -from pandas.util.testing import (assertRaisesRegexp, assertIsInstance) class CheckStringMixin(object): @@ -46,7 +45,7 @@ class CheckImmutable(object): def check_mutable_error(self, *args, **kwargs): # pass whatever functions you normally would to assertRaises (after the # Exception kind) - assertRaisesRegexp(TypeError, self.mutable_regex, *args, **kwargs) + tm.assertRaisesRegexp(TypeError, self.mutable_regex, *args, **kwargs) def test_no_mutable_funcs(self): def setitem(): @@ -79,7 +78,7 @@ def test_slicing_maintains_type(self): def check_result(self, result, expected, klass=None): klass = klass or self.klass - assertIsInstance(result, klass) + self.assertIsInstance(result, klass) self.assertEqual(result, expected) @@ -120,13 +119,13 @@ def setUp(self): def test_shallow_copying(self): original = self.container.copy() - assertIsInstance(self.container.view(), FrozenNDArray) + self.assertIsInstance(self.container.view(), FrozenNDArray) self.assertFalse(isinstance( self.container.view(np.ndarray), FrozenNDArray)) 
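(The FrozenNDArray assertions here and in the hunks that follow all lean on one contract: the container reads like a normal ndarray while every mutating operation raises the TypeError matched by `mutable_regex` in the CheckImmutable mixin. A minimal sketch of that contract, using the same internal `pandas.core.base` import this test module already relies on; the exact error wording is an implementation detail of this pandas vintage, which is why the tests match a regex rather than the full message:

    import numpy as np
    from pandas.core.base import FrozenNDArray

    arr = FrozenNDArray(np.array([1, 2, 3]))

    # reads behave like a plain ndarray
    assert arr[0] == 1
    assert arr.sum() == 6

    # writes are disabled; the raised message is what mutable_regex matches
    try:
        arr[0] = 10
    except TypeError as err:
        print(err)  # "... does not support mutable operations."
)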
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index 2fec7c591a2b7..77ae3ca20d123 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -18,7 +18,6 @@
 from pandas.core.base import (FrozenList, FrozenNDArray, PandasDelegate,
                               NoNewAttributesMixin)
 from pandas.tseries.base import DatetimeIndexOpsMixin
-from pandas.util.testing import (assertRaisesRegexp, assertIsInstance)
 
 
 class CheckStringMixin(object):
@@ -46,7 +45,7 @@ class CheckImmutable(object):
     def check_mutable_error(self, *args, **kwargs):
         # pass whatever functions you normally would to assertRaises (after
         # the Exception kind)
-        assertRaisesRegexp(TypeError, self.mutable_regex, *args, **kwargs)
+        tm.assertRaisesRegexp(TypeError, self.mutable_regex, *args, **kwargs)
 
     def test_no_mutable_funcs(self):
         def setitem():
@@ -79,7 +78,7 @@ def test_slicing_maintains_type(self):
 
     def check_result(self, result, expected, klass=None):
         klass = klass or self.klass
-        assertIsInstance(result, klass)
+        self.assertIsInstance(result, klass)
         self.assertEqual(result, expected)
 
 
@@ -120,13 +119,13 @@ def setUp(self):
 
     def test_shallow_copying(self):
         original = self.container.copy()
-        assertIsInstance(self.container.view(), FrozenNDArray)
+        self.assertIsInstance(self.container.view(), FrozenNDArray)
         self.assertFalse(isinstance(
             self.container.view(np.ndarray), FrozenNDArray))
         self.assertIsNot(self.container.view(), self.container)
         self.assert_numpy_array_equal(self.container, original)
 
         # shallow copy should be the same too
-        assertIsInstance(self.container._shallow_copy(), FrozenNDArray)
+        self.assertIsInstance(self.container._shallow_copy(), FrozenNDArray)
 
         # setting should not be allowed
         def testit(container):
@@ -141,48 +140,53 @@ def test_values(self):
         self.assert_numpy_array_equal(original, vals)
         self.assertIsNot(original, vals)
         vals[0] = n
-        self.assert_numpy_array_equal(self.container, original)
+        self.assertIsInstance(self.container, pd.core.base.FrozenNDArray)
+        self.assert_numpy_array_equal(self.container.values(), original)
         self.assertEqual(vals[0], n)
 
 
 class TestPandasDelegate(tm.TestCase):
 
-    def setUp(self):
-        pass
+    class Delegator(object):
+        _properties = ['foo']
+        _methods = ['bar']
 
-    def test_invalida_delgation(self):
-        # these show that in order for the delegation to work
-        # the _delegate_* methods need to be overriden to not raise a TypeError
+        def _set_foo(self, value):
+            self.foo = value
 
-        class Delegator(object):
-            _properties = ['foo']
-            _methods = ['bar']
+        def _get_foo(self):
+            return self.foo
 
-            def _set_foo(self, value):
-                self.foo = value
+        foo = property(_get_foo, _set_foo, doc="foo property")
 
-            def _get_foo(self):
-                return self.foo
+        def bar(self, *args, **kwargs):
+            """ a test bar method """
+            pass
 
-            foo = property(_get_foo, _set_foo, doc="foo property")
+    class Delegate(PandasDelegate):
 
-            def bar(self, *args, **kwargs):
-                """ a test bar method """
-                pass
+        def __init__(self, obj):
+            self.obj = obj
 
-        class Delegate(PandasDelegate):
+    def setUp(self):
+        pass
 
-            def __init__(self, obj):
-                self.obj = obj
+    def test_invalid_delegation(self):
+        # these show that in order for the delegation to work
+        # the _delegate_* methods need to be overridden to not raise
+        # a TypeError
 
-        Delegate._add_delegate_accessors(delegate=Delegator,
-                                         accessors=Delegator._properties,
-                                         typ='property')
-        Delegate._add_delegate_accessors(delegate=Delegator,
-                                         accessors=Delegator._methods,
-                                         typ='method')
+        self.Delegate._add_delegate_accessors(
+            delegate=self.Delegator,
+            accessors=self.Delegator._properties,
+            typ='property'
+        )
+        self.Delegate._add_delegate_accessors(
+            delegate=self.Delegator,
+            accessors=self.Delegator._methods,
+            typ='method'
+        )
 
-        delegate = Delegate(Delegator())
+        delegate = self.Delegate(self.Delegator())
 
         def f():
             delegate.foo
@@ -199,6 +203,13 @@ def f():
 
         self.assertRaises(TypeError, f)
 
+    def test_memory_usage(self):
+        # Delegate does not implement memory_usage.
+        # Check that we fall back to in-built `__sizeof__`
+        # GH 12924
+        delegate = self.Delegate(self.Delegator())
+        sys.getsizeof(delegate)
+
 
 class Ops(tm.TestCase):
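# ---------------------------------------------------------------------------
# Illustration (not part of the patch): test_memory_usage above relies on a
# plain Python fact -- when a class defines no memory_usage of its own,
# sys.getsizeof() still succeeds because every object inherits
# object.__sizeof__.  With a hypothetical class:
#
#     import sys
#
#     class Plain(object):
#         pass
#
#     sys.getsizeof(Plain())  # falls back to object.__sizeof__, returns an int
# ---------------------------------------------------------------------------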
@@ -437,7 +448,9 @@ def test_nanops(self):
         self.assertEqual(obj.argmax(), -1)
 
     def test_value_counts_unique_nunique(self):
-        for o in self.objs:
+        for orig in self.objs:
+
+            o = orig.copy()
             klass = type(o)
             values = o.values
 
@@ -474,13 +487,11 @@ def test_value_counts_unique_nunique(self):
                 else:
                     expected_index = pd.Index(values[::-1])
                 idx = o.index.repeat(range(1, len(o) + 1))
-                o = klass(
-                    np.repeat(values, range(1,
-                                            len(o) + 1)), index=idx, name='a')
+                o = klass(np.repeat(values, range(1, len(o) + 1)),
+                          index=idx, name='a')
 
-                expected_s = Series(
-                    range(10, 0, -
-                          1), index=expected_index, dtype='int64', name='a')
+                expected_s = Series(range(10, 0, -1), index=expected_index,
+                                    dtype='int64', name='a')
 
                 result = o.value_counts()
                 tm.assert_series_equal(result, expected_s)
@@ -490,10 +501,10 @@ def test_value_counts_unique_nunique(self):
             result = o.unique()
             if isinstance(o, (DatetimeIndex, PeriodIndex)):
                 self.assertTrue(isinstance(result, o.__class__))
-                self.assertEqual(result.name, o.name)
                 self.assertEqual(result.freq, o.freq)
-
-            self.assert_numpy_array_equal(result, values)
+                self.assert_index_equal(result, orig)
+            else:
+                self.assert_numpy_array_equal(result, values)
 
             self.assertEqual(o.nunique(), len(np.unique(o.values)))
 
@@ -530,9 +541,8 @@ def test_value_counts_unique_nunique(self):
                 # resets name from Index
                 expected_index = pd.Index(o, name=None)
                 # attach name to klass
-                o = klass(
-                    np.repeat(values, range(
-                        1, len(o) + 1)), freq=o.freq, name='a')
+                o = klass(np.repeat(values, range(1, len(o) + 1)),
+                          freq=o.freq, name='a')
             elif isinstance(o, Index):
                 expected_index = pd.Index(values, name=None)
                 o = klass(
@@ -599,6 +609,12 @@ def test_value_counts_inferred(self):
             expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
             tm.assert_series_equal(hist, expected)
 
+    def test_value_counts_bins(self):
+        klasses = [Index, Series]
+        for klass in klasses:
+            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
+            s = klass(s_values)
+
             # bins
             self.assertRaises(TypeError,
                               lambda bins: s.value_counts(bins=bins), 1)
@@ -649,6 +665,9 @@ def test_value_counts_inferred(self):
                                     check_dtype=False)
             self.assertEqual(s.nunique(), 0)
 
+    def test_value_counts_datetime64(self):
+        klasses = [Index, Series]
+        for klass in klasses:
             # GH 3002, datetime64[ns]
             # don't test names though
             txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM',
 
             s = klass(df['dt'].copy())
             s.name = None
-            idx = pd.to_datetime(
-                ['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z',
-                 '2009-01-01 00:00:00X'])
+            idx = pd.to_datetime(['2010-01-01 00:00:00Z',
+                                  '2008-09-09 00:00:00Z',
+                                  '2009-01-01 00:00:00X'])
             expected_s = Series([3, 2, 1], index=idx)
             tm.assert_series_equal(s.value_counts(), expected_s)
 
@@ -673,8 +692,7 @@ def test_value_counts_inferred(self):
                                  '2008-09-09 00:00:00Z'], dtype='datetime64[ns]')
             if isinstance(s, DatetimeIndex):
-                expected = DatetimeIndex(expected)
-                self.assertTrue(s.unique().equals(expected))
+                self.assert_index_equal(s.unique(), DatetimeIndex(expected))
             else:
                 self.assert_numpy_array_equal(s.unique(), expected)
 
@@ -696,9 +714,12 @@ def test_value_counts_inferred(self):
             self.assertEqual(unique.dtype, 'datetime64[ns]')
 
             # numpy_array_equal cannot compare pd.NaT
-            self.assert_numpy_array_equal(unique[:3], expected)
-            self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') ==
-                            pd.tslib.iNaT)
+            if isinstance(s, DatetimeIndex):
+                self.assert_index_equal(unique[:3], DatetimeIndex(expected))
+            else:
+                self.assert_numpy_array_equal(unique[:3], expected)
+
+            self.assertTrue(unique[3] is pd.NaT or
+                            unique[3].astype('int64') == pd.tslib.iNaT)
 
             self.assertEqual(s.nunique(), 3)
             self.assertEqual(s.nunique(dropna=False), 4)
@@ -711,9 +732,9 @@ def test_value_counts_inferred(self):
             expected_s = Series([6], index=[Timedelta('1day')], name='dt')
             tm.assert_series_equal(result, expected_s)
 
-            expected = TimedeltaIndex(['1 days'])
+            expected = TimedeltaIndex(['1 days'], name='dt')
             if isinstance(td, TimedeltaIndex):
-                self.assertTrue(td.unique().equals(expected))
+                self.assert_index_equal(td.unique(), expected)
             else:
                 self.assert_numpy_array_equal(td.unique(), expected.values)
 
@@ -723,7 +744,8 @@ def test_value_counts_inferred(self):
             tm.assert_series_equal(result2, expected_s)
 
     def test_factorize(self):
-        for o in self.objs:
+        for orig in self.objs:
+            o = orig.copy()
 
             if isinstance(o, Index) and o.is_boolean():
                 exp_arr = np.array([0, 1] + [0] * 8)
@@ -736,12 +758,16 @@ def test_factorize(self):
             self.assert_numpy_array_equal(labels, exp_arr)
 
             if isinstance(o, Series):
-                expected = Index(o.values)
-                self.assert_numpy_array_equal(uniques, expected)
+                self.assert_index_equal(uniques, Index(orig),
+                                        check_names=False)
             else:
-                self.assertTrue(uniques.equals(exp_uniques))
+                # factorize explicitly resets name
+                self.assert_index_equal(uniques, exp_uniques,
+                                        check_names=False)
 
-        for o in self.objs:
+    def test_factorize_repeated(self):
+        for orig in self.objs:
+            o = orig.copy()
 
             # don't test boolean
             if isinstance(o, Index) and o.is_boolean():
@@ -761,27 +787,25 @@ def test_factorize(self):
             self.assert_numpy_array_equal(labels, exp_arr)
 
             if isinstance(o, Series):
-                expected = Index(o.values)
-                self.assert_numpy_array_equal(uniques, expected)
+                self.assert_index_equal(uniques, Index(orig).sort_values(),
+                                        check_names=False)
             else:
-                self.assertTrue(uniques.equals(o))
+                self.assert_index_equal(uniques, o, check_names=False)
 
             exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4])
             labels, uniques = n.factorize(sort=False)
             self.assert_numpy_array_equal(labels, exp_arr)
 
             if isinstance(o, Series):
-                expected = Index(np.concatenate([o.values[5:10], o.values[:5]
-                                                 ]))
-                self.assert_numpy_array_equal(uniques, expected)
+                expected = Index(o.iloc[5:10].append(o.iloc[:5]))
+                self.assert_index_equal(uniques, expected, check_names=False)
             else:
-                expected = o[5:].append(o[:5])
-                self.assertTrue(uniques.equals(expected))
+                expected = o[5:10].append(o[:5])
+                self.assert_index_equal(uniques, expected, check_names=False)
 
-    def test_duplicated_drop_duplicates(self):
+    def test_duplicated_drop_duplicates_index(self):
         # GH 4060
         for original in self.objs:
-
             if isinstance(original, Index):  # special case
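# ---------------------------------------------------------------------------
# Illustration (not part of the patch): the factorize() contract the tests
# above pin down, on toy data.  factorize returns integer labels plus the
# uniques (an Index for Series input), and sort=False keeps uniques in
# order of first appearance:
#
#     import pandas as pd
#
#     labels, uniques = pd.Series(['b', 'a', 'b', 'c']).factorize()
#     # labels  -> array([0, 1, 0, 2])
#     # uniques -> Index(['b', 'a', 'c'], dtype='object')
# ---------------------------------------------------------------------------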
Categorical(["a", "b", "c", "a"]) c2 = Categorical(c1) - self.assertTrue(c1.equals(c2)) + tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) c2 = Categorical(c1) - self.assertTrue(c1.equals(c2)) + tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) c2 = Categorical(c1) - self.assertTrue(c1.equals(c2)) + tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) c2 = Categorical(c1, categories=["a", "b", "c"]) self.assert_numpy_array_equal(c1.__array__(), c2.__array__()) - self.assert_numpy_array_equal(c2.categories, np.array(["a", "b", "c"])) + self.assert_index_equal(c2.categories, Index(["a", "b", "c"])) # Series of dtype category c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) c2 = Categorical(Series(c1)) - self.assertTrue(c1.equals(c2)) + tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) c2 = Categorical(Series(c1)) - self.assertTrue(c1.equals(c2)) + tm.assert_categorical_equal(c1, c2) # Series c1 = Categorical(["a", "b", "c", "a"]) c2 = Categorical(Series(["a", "b", "c", "a"])) - self.assertTrue(c1.equals(c2)) + tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - c2 = Categorical( - Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"]) - self.assertTrue(c1.equals(c2)) + c2 = Categorical(Series(["a", "b", "c", "a"]), + categories=["a", "b", "c", "d"]) + tm.assert_categorical_equal(c1, c2) # This should result in integer categories, not float! cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) @@ -281,11 +283,12 @@ def f(): def test_constructor_with_index(self): ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - self.assertTrue(ci.values.equals(Categorical(ci))) + tm.assert_categorical_equal(ci.values, Categorical(ci)) ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - self.assertTrue(ci.values.equals(Categorical( - ci.astype(object), categories=ci.categories))) + tm.assert_categorical_equal(ci.values, + Categorical(ci.astype(object), + categories=ci.categories)) def test_constructor_with_generator(self): # This was raising an Error in isnull(single_val).any() because isnull @@ -294,9 +297,9 @@ def test_constructor_with_generator(self): exp = Categorical([0, 1, 2]) cat = Categorical((x for x in [0, 1, 2])) - self.assertTrue(cat.equals(exp)) + tm.assert_categorical_equal(cat, exp) cat = Categorical(xrange(3)) - self.assertTrue(cat.equals(exp)) + tm.assert_categorical_equal(cat, exp) # This uses xrange internally from pandas.core.index import MultiIndex @@ -304,9 +307,9 @@ def test_constructor_with_generator(self): # check that categories accept generators and sequences cat = pd.Categorical([0, 1, 2], categories=(x for x in [0, 1, 2])) - self.assertTrue(cat.equals(exp)) + tm.assert_categorical_equal(cat, exp) cat = pd.Categorical([0, 1, 2], categories=xrange(3)) - self.assertTrue(cat.equals(exp)) + tm.assert_categorical_equal(cat, exp) def test_constructor_with_datetimelike(self): @@ -393,7 +396,7 @@ def f(): exp = Categorical(["a", "b", "c"], ordered=False) res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"]) - self.assertTrue(exp.equals(res)) + tm.assert_categorical_equal(exp, res) # Not available in earlier numpy versions if hasattr(np.random, "choice"): @@ -404,27 +407,27 @@ def test_comparisons(self): result = self.factor[self.factor == 'a'] expected = 
-        self.assertTrue(result.equals(expected))
+        tm.assert_categorical_equal(result, expected)
 
         result = self.factor[self.factor != 'a']
         expected = self.factor[np.asarray(self.factor) != 'a']
-        self.assertTrue(result.equals(expected))
+        tm.assert_categorical_equal(result, expected)
 
         result = self.factor[self.factor < 'c']
         expected = self.factor[np.asarray(self.factor) < 'c']
-        self.assertTrue(result.equals(expected))
+        tm.assert_categorical_equal(result, expected)
 
         result = self.factor[self.factor > 'a']
         expected = self.factor[np.asarray(self.factor) > 'a']
-        self.assertTrue(result.equals(expected))
+        tm.assert_categorical_equal(result, expected)
 
         result = self.factor[self.factor >= 'b']
         expected = self.factor[np.asarray(self.factor) >= 'b']
-        self.assertTrue(result.equals(expected))
+        tm.assert_categorical_equal(result, expected)
 
         result = self.factor[self.factor <= 'b']
         expected = self.factor[np.asarray(self.factor) <= 'b']
-        self.assertTrue(result.equals(expected))
+        tm.assert_categorical_equal(result, expected)
 
         n = len(self.factor)
 
@@ -551,33 +554,40 @@ def test_na_flags_int_categories(self):
 
     def test_categories_none(self):
         factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
                              ordered=True)
-        self.assertTrue(factor.equals(self.factor))
+        tm.assert_categorical_equal(factor, self.factor)
 
     def test_describe(self):
         # string type
         desc = self.factor.describe()
+        self.assertTrue(self.factor.ordered)
+        exp_index = pd.CategoricalIndex(['a', 'b', 'c'], name='categories',
+                                        ordered=self.factor.ordered)
         expected = DataFrame({'counts': [3, 2, 3],
                               'freqs': [3 / 8., 2 / 8., 3 / 8.]},
-                             index=pd.CategoricalIndex(['a', 'b', 'c'],
-                                                       name='categories'))
+                             index=exp_index)
         tm.assert_frame_equal(desc, expected)
 
         # check unused categories
         cat = self.factor.copy()
         cat.set_categories(["a", "b", "c", "d"], inplace=True)
         desc = cat.describe()
+
+        exp_index = pd.CategoricalIndex(['a', 'b', 'c', 'd'],
+                                        ordered=self.factor.ordered,
+                                        name='categories')
         expected = DataFrame({'counts': [3, 2, 3, 0],
                               'freqs': [3 / 8., 2 / 8., 3 / 8., 0]},
-                             index=pd.CategoricalIndex(['a', 'b', 'c', 'd'],
-                                                       name='categories'))
+                             index=exp_index)
         tm.assert_frame_equal(desc, expected)
 
         # check an integer one
-        desc = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]).describe()
+        cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
+        desc = cat.describe()
+        exp_index = pd.CategoricalIndex([1, 2, 3], ordered=cat.ordered,
+                                        name='categories')
         expected = DataFrame({'counts': [5, 3, 3],
                               'freqs': [5 / 11., 3 / 11., 3 / 11.]},
-                             index=pd.CategoricalIndex([1, 2, 3],
-                                                       name='categories'))
+                             index=exp_index)
         tm.assert_frame_equal(desc, expected)
 
         # https://github.com/pydata/pandas/issues/3678
@@ -601,7 +611,7 @@ def test_describe(self):
                                 columns=['counts', 'freqs'],
                                 index=pd.CategoricalIndex(['b', 'a', 'c', np.nan],
                                                           name='categories'))
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_categorical=False)
 
         # NA as an unused category
         with tm.assert_produces_warning(FutureWarning):
@@ -613,7 +623,7 @@ def test_describe(self):
             ['b', 'a', 'c', np.nan], name='categories')
         expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]],
                              columns=['counts', 'freqs'], index=exp_idx)
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_categorical=False)
 
     def test_print(self):
         expected = ["[a, b, b, a, a, c, c, c]",
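# ---------------------------------------------------------------------------
# Illustration (not part of the patch): the describe() layout the updated
# expectations encode -- counts and freqs per category, indexed by a
# CategoricalIndex named 'categories' that now also carries the `ordered`
# flag.  Using the same data as self.factor in these tests:
#
#     import pandas as pd
#
#     cat = pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
#                          ordered=True)
#     cat.describe()
#     #             counts  freqs
#     # categories
#     # a                3  0.375
#     # b                2  0.250
#     # c                3  0.375
# ---------------------------------------------------------------------------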
@@ -703,7 +713,7 @@ def test_periodindex(self):
         exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
         exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
         self.assert_numpy_array_equal(cat1._codes, exp_arr)
-        self.assertTrue(cat1.categories.equals(exp_idx))
+        self.assert_index_equal(cat1.categories, exp_idx)
 
         idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
                             '2014-03', '2014-01'], freq='M')
@@ -712,7 +722,7 @@ def test_periodindex(self):
         exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
         exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
         self.assert_numpy_array_equal(cat2._codes, exp_arr)
-        self.assertTrue(cat2.categories.equals(exp_idx2))
+        self.assert_index_equal(cat2.categories, exp_idx2)
 
         idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
                             '2013-08', '2013-07', '2013-05'], freq='M')
@@ -721,15 +731,14 @@ def test_periodindex(self):
         exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09',
                                '2013-10', '2013-11', '2013-12'], freq='M')
         self.assert_numpy_array_equal(cat3._codes, exp_arr)
-        self.assertTrue(cat3.categories.equals(exp_idx))
+        self.assert_index_equal(cat3.categories, exp_idx)
 
     def test_categories_assigments(self):
         s = pd.Categorical(["a", "b", "c", "a"])
         exp = np.array([1, 2, 3, 1], dtype=np.int64)
         s.categories = [1, 2, 3]
         self.assert_numpy_array_equal(s.__array__(), exp)
-        self.assert_numpy_array_equal(s.categories,
-                                      np.array([1, 2, 3], dtype=np.int64))
+        self.assert_index_equal(s.categories, Index([1, 2, 3]))
 
         # lengthen
         def f():
@@ -755,21 +764,21 @@ def test_construction_with_ordered(self):
 
     def test_ordered_api(self):
         # GH 9347
         cat1 = pd.Categorical(["a", "c", "b"], ordered=False)
-        self.assertTrue(cat1.categories.equals(Index(['a', 'b', 'c'])))
+        self.assert_index_equal(cat1.categories, Index(['a', 'b', 'c']))
         self.assertFalse(cat1.ordered)
 
         cat2 = pd.Categorical(["a", "c", "b"], categories=['b', 'c', 'a'],
                               ordered=False)
-        self.assertTrue(cat2.categories.equals(Index(['b', 'c', 'a'])))
+        self.assert_index_equal(cat2.categories, Index(['b', 'c', 'a']))
         self.assertFalse(cat2.ordered)
 
         cat3 = pd.Categorical(["a", "c", "b"], ordered=True)
-        self.assertTrue(cat3.categories.equals(Index(['a', 'b', 'c'])))
+        self.assert_index_equal(cat3.categories, Index(['a', 'b', 'c']))
         self.assertTrue(cat3.ordered)
 
         cat4 = pd.Categorical(["a", "c", "b"], categories=['b', 'c', 'a'],
                               ordered=True)
-        self.assertTrue(cat4.categories.equals(Index(['b', 'c', 'a'])))
+        self.assert_index_equal(cat4.categories, Index(['b', 'c', 'a']))
         self.assertTrue(cat4.ordered)
 
     def test_set_ordered(self):
@@ -801,21 +810,21 @@ def test_set_ordered(self):
 
     def test_set_categories(self):
         cat = Categorical(["a", "b", "c", "a"], ordered=True)
-        exp_categories = np.array(["c", "b", "a"], dtype=np.object_)
+        exp_categories = Index(["c", "b", "a"])
         exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
 
         res = cat.set_categories(["c", "b", "a"], inplace=True)
-        self.assert_numpy_array_equal(cat.categories, exp_categories)
+        self.assert_index_equal(cat.categories, exp_categories)
         self.assert_numpy_array_equal(cat.__array__(), exp_values)
         self.assertIsNone(res)
 
         res = cat.set_categories(["a", "b", "c"])
         # cat must be the same as before
-        self.assert_numpy_array_equal(cat.categories, exp_categories)
+        self.assert_index_equal(cat.categories, exp_categories)
         self.assert_numpy_array_equal(cat.__array__(), exp_values)
 
         # only res is changed
-        exp_categories_back = np.array(["a", "b", "c"])
-        self.assert_numpy_array_equal(res.categories, exp_categories_back)
+        exp_categories_back = Index(["a", "b", "c"])
+        self.assert_index_equal(res.categories, exp_categories_back)
         self.assert_numpy_array_equal(res.__array__(), exp_values)
 
         # not all "old" included in "new" -> all not included ones are now
@@ -829,19 +838,18 @@ def test_set_categories(self):
         res = cat.set_categories(["a", "b", "d"])
         self.assert_numpy_array_equal(res.codes,
                                       np.array([0, 1, -1, 0], dtype=np.int8))
-        self.assert_numpy_array_equal(res.categories,
-                                      np.array(["a", "b", "d"]))
+        self.assert_index_equal(res.categories, Index(["a", "b", "d"]))
 
         # all "old" included in "new"
         cat = cat.set_categories(["a", "b", "c", "d"])
-        exp_categories = np.array(["a", "b", "c", "d"], dtype=np.object_)
-        self.assert_numpy_array_equal(cat.categories, exp_categories)
+        exp_categories = Index(["a", "b", "c", "d"])
+        self.assert_index_equal(cat.categories, exp_categories)
 
         # internals...
         c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
         self.assert_numpy_array_equal(c._codes,
                                       np.array([0, 1, 2, 3, 0], dtype=np.int8))
-        self.assert_numpy_array_equal(c.categories, np.array([1, 2, 3, 4]))
+        self.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
 
         exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
         self.assert_numpy_array_equal(c.get_values(), exp)
@@ -854,7 +862,7 @@ def test_set_categories(self):
                                       np.array([3, 2, 1, 0, 3], dtype=np.int8))
 
         # categories are now in new order
-        self.assert_numpy_array_equal(c.categories, np.array([4, 3, 2, 1]))
+        self.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
 
         # output is the same
         exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
@@ -879,22 +887,20 @@ def test_rename_categories(self):
         res = cat.rename_categories([1, 2, 3])
         self.assert_numpy_array_equal(res.__array__(),
                                       np.array([1, 2, 3, 1], dtype=np.int64))
-        self.assert_numpy_array_equal(res.categories,
-                                      np.array([1, 2, 3], dtype=np.int64))
+        self.assert_index_equal(res.categories, Index([1, 2, 3]))
 
         exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
         self.assert_numpy_array_equal(cat.__array__(), exp_cat)
 
-        exp_cat = np.array(["a", "b", "c"], dtype=np.object_)
-        self.assert_numpy_array_equal(cat.categories, exp_cat)
+        exp_cat = Index(["a", "b", "c"])
+        self.assert_index_equal(cat.categories, exp_cat)
 
         res = cat.rename_categories([1, 2, 3], inplace=True)
 
         # and now inplace
         self.assertIsNone(res)
         self.assert_numpy_array_equal(cat.__array__(),
                                       np.array([1, 2, 3, 1], dtype=np.int64))
-        self.assert_numpy_array_equal(cat.categories,
-                                      np.array([1, 2, 3], dtype=np.int64))
+        self.assert_index_equal(cat.categories, Index([1, 2, 3]))
 
         # lengthen
         def f():
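# ---------------------------------------------------------------------------
# Illustration (not part of the patch): the invariants asserted in
# test_set_categories/test_rename_categories -- `.categories` is an Index
# (hence the switch to assert_index_equal), the inplace=True variants
# return None, and values not covered by the new categories get code -1.
# Values taken from the tests:
#
#     import pandas as pd
#
#     cat = pd.Categorical(['a', 'b', 'c', 'a'])
#     res = cat.set_categories(['a', 'b', 'd'])
#     res.codes                                    # [0, 1, -1, 0] ('c' dropped)
#     cat.rename_categories([1, 2, 3]).categories  # Index([1, 2, 3])
# ---------------------------------------------------------------------------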
np.nan], categories=["a", "b", "c", "d", "e"]) res = c.remove_unused_categories() - self.assert_numpy_array_equal(res.categories, - np.array(["a", "b", "c"])) - self.assert_numpy_array_equal(c.categories, exp_categories_all) + self.assert_index_equal(res.categories, + Index(np.array(["a", "b", "c"]))) + exp_codes = np.array([0, 1, 2, -1], dtype=np.int8) + self.assert_numpy_array_equal(res.codes, exp_codes) + self.assert_index_equal(c.categories, exp_categories_all) val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan] cat = pd.Categorical(values=val, categories=list('ABCDEFG')) out = cat.remove_unused_categories() - self.assert_numpy_array_equal(out.categories, ['B', 'D', 'F']) - self.assert_numpy_array_equal(out.codes, [2, -1, 1, 0, 1, 2, -1]) + self.assert_index_equal(out.categories, Index(['B', 'D', 'F'])) + exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8) + self.assert_numpy_array_equal(out.codes, exp_codes) self.assertEqual(out.get_values().tolist(), val) alpha = list('abcdefghijklmnopqrstuvwxyz') @@ -1055,11 +1064,11 @@ def test_nan_handling(self): # Nans are represented as -1 in codes c = Categorical(["a", "b", np.nan, "a"]) - self.assert_numpy_array_equal(c.categories, np.array(["a", "b"])) + self.assert_index_equal(c.categories, Index(["a", "b"])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) c[1] = np.nan - self.assert_numpy_array_equal(c.categories, np.array(["a", "b"])) + self.assert_index_equal(c.categories, Index(["a", "b"])) self.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8)) @@ -1068,15 +1077,11 @@ def test_nan_handling(self): with tm.assert_produces_warning(FutureWarning): c = Categorical(["a", "b", np.nan, "a"], categories=["a", "b", np.nan]) - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 0], dtype=np.int8)) c[1] = np.nan - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 2, 2, 0], dtype=np.int8)) @@ -1085,30 +1090,24 @@ def test_nan_handling(self): with tm.assert_produces_warning(FutureWarning): c.categories = ["a", "b", np.nan] # noqa - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 0], dtype=np.int8)) # Adding nan to categories should make assigned nan point to the # category! 
c = Categorical(["a", "b", np.nan, "a"]) - self.assert_numpy_array_equal(c.categories, np.array(["a", "b"])) + self.assert_index_equal(c.categories, Index(["a", "b"])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) with tm.assert_produces_warning(FutureWarning): c.set_categories(["a", "b", np.nan], rename=True, inplace=True) - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) c[1] = np.nan - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 2, -1, 0], dtype=np.int8)) @@ -1234,63 +1233,58 @@ def test_min_max(self): def test_unique(self): # categories are reordered based on value when ordered=False cat = Categorical(["a", "b"]) - exp = np.asarray(["a", "b"]) + exp = Index(["a", "b"]) res = cat.unique() - self.assert_numpy_array_equal(res, exp) + self.assert_index_equal(res.categories, exp) + self.assert_categorical_equal(res, cat) cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"]) res = cat.unique() - self.assert_numpy_array_equal(res, exp) + self.assert_index_equal(res.categories, exp) tm.assert_categorical_equal(res, Categorical(exp)) cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"]) - exp = np.asarray(["c", "a", "b"]) + exp = Index(["c", "a", "b"]) res = cat.unique() - self.assert_numpy_array_equal(res, exp) - tm.assert_categorical_equal(res, Categorical( - exp, categories=['c', 'a', 'b'])) + self.assert_index_equal(res.categories, exp) + exp_cat = Categorical(exp, categories=['c', 'a', 'b']) + tm.assert_categorical_equal(res, exp_cat) # nan must be removed cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"]) res = cat.unique() - exp = np.asarray(["b", np.nan, "a"], dtype=object) - self.assert_numpy_array_equal(res, exp) - tm.assert_categorical_equal(res, Categorical( - ["b", np.nan, "a"], categories=["b", "a"])) + exp = Index(["b", "a"]) + self.assert_index_equal(res.categories, exp) + exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"]) + tm.assert_categorical_equal(res, exp_cat) def test_unique_ordered(self): # keep categories order when ordered=True cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True) res = cat.unique() - exp = np.asarray(['b', 'a']) - exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True) - self.assert_numpy_array_equal(res, exp) + exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True) tm.assert_categorical_equal(res, exp_cat) cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True) res = cat.unique() - exp = np.asarray(['c', 'b', 'a']) - exp_cat = Categorical(exp, categories=['a', 'b', 'c'], ordered=True) - self.assert_numpy_array_equal(res, exp) + exp_cat = Categorical(['c', 'b', 'a'], categories=['a', 'b', 'c'], + ordered=True) tm.assert_categorical_equal(res, exp_cat) cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True) res = cat.unique() - exp = np.asarray(['b', 'a']) - exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True) - self.assert_numpy_array_equal(res, exp) + exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True) tm.assert_categorical_equal(res, exp_cat) cat = Categorical(['b', 
@@ -1234,63 +1233,58 @@ def test_min_max(self):
 
     def test_unique(self):
         # categories are reordered based on value when ordered=False
         cat = Categorical(["a", "b"])
-        exp = np.asarray(["a", "b"])
+        exp = Index(["a", "b"])
         res = cat.unique()
-        self.assert_numpy_array_equal(res, exp)
+        self.assert_index_equal(res.categories, exp)
+        self.assert_categorical_equal(res, cat)
 
         cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
         res = cat.unique()
-        self.assert_numpy_array_equal(res, exp)
+        self.assert_index_equal(res.categories, exp)
         tm.assert_categorical_equal(res, Categorical(exp))
 
         cat = Categorical(["c", "a", "b", "a", "a"],
                           categories=["a", "b", "c"])
-        exp = np.asarray(["c", "a", "b"])
+        exp = Index(["c", "a", "b"])
         res = cat.unique()
-        self.assert_numpy_array_equal(res, exp)
-        tm.assert_categorical_equal(res, Categorical(
-            exp, categories=['c', 'a', 'b']))
+        self.assert_index_equal(res.categories, exp)
+        exp_cat = Categorical(exp, categories=['c', 'a', 'b'])
+        tm.assert_categorical_equal(res, exp_cat)
 
         # nan must be removed
         cat = Categorical(["b", np.nan, "b", np.nan, "a"],
                           categories=["a", "b", "c"])
         res = cat.unique()
-        exp = np.asarray(["b", np.nan, "a"], dtype=object)
-        self.assert_numpy_array_equal(res, exp)
-        tm.assert_categorical_equal(res, Categorical(
-            ["b", np.nan, "a"], categories=["b", "a"]))
+        exp = Index(["b", "a"])
+        self.assert_index_equal(res.categories, exp)
+        exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
+        tm.assert_categorical_equal(res, exp_cat)
 
     def test_unique_ordered(self):
         # keep categories order when ordered=True
         cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True)
         res = cat.unique()
-        exp = np.asarray(['b', 'a'])
-        exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
-        self.assert_numpy_array_equal(res, exp)
+        exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True)
         tm.assert_categorical_equal(res, exp_cat)
 
         cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'],
                           ordered=True)
         res = cat.unique()
-        exp = np.asarray(['c', 'b', 'a'])
-        exp_cat = Categorical(exp, categories=['a', 'b', 'c'], ordered=True)
-        self.assert_numpy_array_equal(res, exp)
+        exp_cat = Categorical(['c', 'b', 'a'], categories=['a', 'b', 'c'],
+                              ordered=True)
         tm.assert_categorical_equal(res, exp_cat)
 
         cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'],
                           ordered=True)
         res = cat.unique()
-        exp = np.asarray(['b', 'a'])
-        exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
-        self.assert_numpy_array_equal(res, exp)
+        exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True)
         tm.assert_categorical_equal(res, exp_cat)
 
         cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'],
                           ordered=True)
         res = cat.unique()
-        exp = np.asarray(['b', np.nan, 'a'], dtype=object)
-        exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
-        self.assert_numpy_array_equal(res, exp)
+        exp_cat = Categorical(['b', np.nan, 'a'], categories=['a', 'b'],
+                              ordered=True)
         tm.assert_categorical_equal(res, exp_cat)
 
     def test_mode(self):
@@ -1298,33 +1292,33 @@ def test_mode(self):
                         ordered=True)
         res = s.mode()
         exp = Categorical([5], categories=[5, 4, 3, 2, 1], ordered=True)
-        self.assertTrue(res.equals(exp))
+        tm.assert_categorical_equal(res, exp)
         s = Categorical([1, 1, 1, 4, 5, 5, 5], categories=[5, 4, 3, 2, 1],
                         ordered=True)
         res = s.mode()
         exp = Categorical([5, 1], categories=[5, 4, 3, 2, 1], ordered=True)
-        self.assertTrue(res.equals(exp))
+        tm.assert_categorical_equal(res, exp)
         s = Categorical([1, 2, 3, 4, 5], categories=[5, 4, 3, 2, 1],
                         ordered=True)
         res = s.mode()
         exp = Categorical([], categories=[5, 4, 3, 2, 1], ordered=True)
-        self.assertTrue(res.equals(exp))
+        tm.assert_categorical_equal(res, exp)
         # NaN should not become the mode!
         s = Categorical([np.nan, np.nan, np.nan, 4, 5],
                         categories=[5, 4, 3, 2, 1], ordered=True)
         res = s.mode()
         exp = Categorical([], categories=[5, 4, 3, 2, 1], ordered=True)
-        self.assertTrue(res.equals(exp))
+        tm.assert_categorical_equal(res, exp)
         s = Categorical([np.nan, np.nan, np.nan, 4, 5, 4],
                         categories=[5, 4, 3, 2, 1], ordered=True)
         res = s.mode()
         exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True)
-        self.assertTrue(res.equals(exp))
+        tm.assert_categorical_equal(res, exp)
         s = Categorical([np.nan, np.nan, 4, 5, 4], categories=[5, 4, 3, 2, 1],
                         ordered=True)
         res = s.mode()
         exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True)
-        self.assertTrue(res.equals(exp))
+        tm.assert_categorical_equal(res, exp)
 
     def test_sort_values(self):
 
@@ -1338,79 +1332,83 @@ def test_sort_values(self):
         res = cat.sort_values()
         exp = np.array(["a", "b", "c", "d"], dtype=object)
         self.assert_numpy_array_equal(res.__array__(), exp)
+        self.assert_index_equal(res.categories, cat.categories)
 
         cat = Categorical(["a", "c", "b", "d"],
                           categories=["a", "b", "c", "d"], ordered=True)
         res = cat.sort_values()
         exp = np.array(["a", "b", "c", "d"], dtype=object)
         self.assert_numpy_array_equal(res.__array__(), exp)
+        self.assert_index_equal(res.categories, cat.categories)
 
         res = cat.sort_values(ascending=False)
         exp = np.array(["d", "c", "b", "a"], dtype=object)
         self.assert_numpy_array_equal(res.__array__(), exp)
+        self.assert_index_equal(res.categories, cat.categories)
 
         # sort (inplace order)
         cat1 = cat.copy()
         cat1.sort_values(inplace=True)
         exp = np.array(["a", "b", "c", "d"], dtype=object)
         self.assert_numpy_array_equal(cat1.__array__(), exp)
+        self.assert_index_equal(res.categories, cat.categories)
 
         # reverse
         cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
         res = cat.sort_values(ascending=False)
         exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
-        exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
+        exp_categories = Index(["a", "b", "c", "d"])
         self.assert_numpy_array_equal(res.__array__(), exp_val)
-        self.assert_numpy_array_equal(res.categories, exp_categories)
+        self.assert_index_equal(res.categories, exp_categories)
 
     def test_sort_values_na_position(self):
         # see gh-12882
         cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
-        exp_categories = np.array([2, 5])
+        exp_categories = Index([2, 5])
 
         exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
         res = cat.sort_values()  # default arguments
         self.assert_numpy_array_equal(res.__array__(), exp)
-        self.assert_numpy_array_equal(res.categories, exp_categories)
+        self.assert_index_equal(res.categories, exp_categories)
 
         exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
         res = cat.sort_values(ascending=True, na_position='first')
         self.assert_numpy_array_equal(res.__array__(), exp)
-        self.assert_numpy_array_equal(res.categories, exp_categories)
+        self.assert_index_equal(res.categories, exp_categories)
 
         exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
         res = cat.sort_values(ascending=False, na_position='first')
         self.assert_numpy_array_equal(res.__array__(), exp)
-        self.assert_numpy_array_equal(res.categories, exp_categories)
+        self.assert_index_equal(res.categories, exp_categories)
 
         exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
         res = cat.sort_values(ascending=True, na_position='last')
         self.assert_numpy_array_equal(res.__array__(), exp)
-        self.assert_numpy_array_equal(res.categories, exp_categories)
+        self.assert_index_equal(res.categories, exp_categories)
 
         exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
         res = cat.sort_values(ascending=False, na_position='last')
         self.assert_numpy_array_equal(res.__array__(), exp)
-        self.assert_numpy_array_equal(res.categories, exp_categories)
+        self.assert_index_equal(res.categories, exp_categories)
 
         cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
         res = cat.sort_values(ascending=False, na_position='last')
         exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
-        exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
+        exp_categories = Index(["a", "b", "c", "d"])
         self.assert_numpy_array_equal(res.__array__(), exp_val)
-        self.assert_numpy_array_equal(res.categories, exp_categories)
+        self.assert_index_equal(res.categories, exp_categories)
 
         cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
         res = cat.sort_values(ascending=False, na_position='first')
         exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
-        exp_categories = np.array(["a", "b", "c", "d"], dtype=object)
+        exp_categories = Index(["a", "b", "c", "d"])
         self.assert_numpy_array_equal(res.__array__(), exp_val)
-        self.assert_numpy_array_equal(res.categories, exp_categories)
+        self.assert_index_equal(res.categories, exp_categories)
 
     def test_slicing_directly(self):
         cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
         sliced = cat[3]
-        tm.assert_equal(sliced, "d")
+        self.assertEqual(sliced, "d")
         sliced = cat[3:5]
         expected = Categorical(["d", "a"], categories=['a', 'b', 'c', 'd'])
         self.assert_numpy_array_equal(sliced._codes, expected._codes)
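# ---------------------------------------------------------------------------
# Illustration (not part of the patch): the na_position contract pinned
# down above, with the same data the test uses:
#
#     import numpy as np
#     import pandas as pd
#
#     cat = pd.Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
#     cat.sort_values()                     # [2, 2, 5, NaN, NaN] (default 'last')
#     cat.sort_values(na_position='first')  # [NaN, NaN, 2, 2, 5]
#     cat.sort_values(ascending=False)      # [5, 2, 2, NaN, NaN]
# ---------------------------------------------------------------------------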
@@ -1420,7 +1418,7 @@ def test_set_item_nan(self):
         cat = pd.Categorical([1, 2, 3])
         exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
         cat[1] = np.nan
-        self.assertTrue(cat.equals(exp))
+        tm.assert_categorical_equal(cat, exp)
 
         # if nan in categories, the proper code should be set!
         cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
@@ -1560,10 +1558,10 @@ def test_deprecated_levels(self):
         exp = cat.categories
         with tm.assert_produces_warning(FutureWarning):
            res = cat.levels
-        self.assert_numpy_array_equal(res, exp)
+        self.assert_index_equal(res, exp)
         with tm.assert_produces_warning(FutureWarning):
             res = pd.Categorical([1, 2, 3, np.nan], levels=[1, 2, 3])
-        self.assert_numpy_array_equal(res.categories, exp)
+        self.assert_index_equal(res.categories, exp)
 
     def test_removed_names_produces_warning(self):
 
@@ -1577,14 +1575,18 @@ def test_removed_names_produces_warning(self):
     def test_datetime_categorical_comparison(self):
         dt_cat = pd.Categorical(
             pd.date_range('2014-01-01', periods=3), ordered=True)
-        self.assert_numpy_array_equal(dt_cat > dt_cat[0], [False, True, True])
-        self.assert_numpy_array_equal(dt_cat[0] < dt_cat, [False, True, True])
+        self.assert_numpy_array_equal(dt_cat > dt_cat[0],
+                                      np.array([False, True, True]))
+        self.assert_numpy_array_equal(dt_cat[0] < dt_cat,
+                                      np.array([False, True, True]))
 
     def test_reflected_comparison_with_scalars(self):
         # GH8658
         cat = pd.Categorical([1, 2, 3], ordered=True)
-        self.assert_numpy_array_equal(cat > cat[0], [False, True, True])
-        self.assert_numpy_array_equal(cat[0] < cat, [False, True, True])
+        self.assert_numpy_array_equal(cat > cat[0],
+                                      np.array([False, True, True]))
+        self.assert_numpy_array_equal(cat[0] < cat,
+                                      np.array([False, True, True]))
 
     def test_comparison_with_unknown_scalars(self):
         # https://github.com/pydata/pandas/issues/9836#issuecomment-92123057
@@ -1597,8 +1599,10 @@ def test_comparison_with_unknown_scalars(self):
         self.assertRaises(TypeError, lambda: 4 < cat)
         self.assertRaises(TypeError, lambda: 4 > cat)
 
-        self.assert_numpy_array_equal(cat == 4, [False, False, False])
-        self.assert_numpy_array_equal(cat != 4, [True, True, True])
+        self.assert_numpy_array_equal(cat == 4,
+                                      np.array([False, False, False]))
+        self.assert_numpy_array_equal(cat != 4,
+                                      np.array([True, True, True]))
 
     def test_map(self):
         c = pd.Categorical(list('ABABC'), categories=list('CBA'),
@@ -1925,8 +1929,7 @@ def test_nan_handling(self):
 
         # Nans are represented as -1 in labels
         s = Series(Categorical(["a", "b", np.nan, "a"]))
-        self.assert_numpy_array_equal(s.cat.categories,
-                                      np.array(["a", "b"], dtype=np.object_))
+        self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
         self.assert_numpy_array_equal(s.values.codes,
                                       np.array([0, 1, -1, 0], dtype=np.int8))
 
@@ -1936,8 +1939,8 @@ def test_nan_handling(self):
 
         s2 = Series(Categorical(["a", "b", np.nan, "a"],
                                 categories=["a", "b", np.nan]))
-        exp_cat = np.array(["a", "b", np.nan], dtype=np.object_)
-        self.assert_numpy_array_equal(s2.cat.categories, exp_cat)
+        exp_cat = Index(["a", "b", np.nan])
+        self.assert_index_equal(s2.cat.categories, exp_cat)
         self.assert_numpy_array_equal(s2.values.codes,
                                       np.array([0, 1, 2, 0], dtype=np.int8))
 
@@ -1946,24 +1949,26 @@ def test_nan_handling(self):
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             s3.cat.categories = ["a", "b", np.nan]
-        exp_cat = np.array(["a", "b", np.nan], dtype=np.object_)
-        self.assert_numpy_array_equal(s3.cat.categories, exp_cat)
+        exp_cat = Index(["a", "b", np.nan])
+        self.assert_index_equal(s3.cat.categories, exp_cat)
         self.assert_numpy_array_equal(s3.values.codes,
                                       np.array([0, 1, 2, 0], dtype=np.int8))
 
     def test_cat_accessor(self):
         s = Series(Categorical(["a", "b", np.nan, "a"]))
-        self.assert_numpy_array_equal(s.cat.categories, np.array(["a", "b"]))
self.assert_index_equal(s.cat.categories, Index(["a", "b"])) self.assertEqual(s.cat.ordered, False) exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"]) s.cat.set_categories(["b", "a"], inplace=True) - self.assertTrue(s.values.equals(exp)) + tm.assert_categorical_equal(s.values, exp) + res = s.cat.set_categories(["b", "a"]) - self.assertTrue(res.values.equals(exp)) + tm.assert_categorical_equal(res.values, exp) + exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"]) s[:] = "a" s = s.cat.remove_unused_categories() - self.assert_numpy_array_equal(s.cat.categories, np.array(["a"])) + self.assert_index_equal(s.cat.categories, Index(["a"])) def test_sequence_like(self): @@ -2005,11 +2010,11 @@ def test_series_delegations(self): # and the methods '.set_categories()' 'drop_unused_categories()' to the # categorical s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) - exp_categories = np.array(["a", "b", "c"]) - self.assert_numpy_array_equal(s.cat.categories, exp_categories) + exp_categories = Index(["a", "b", "c"]) + tm.assert_index_equal(s.cat.categories, exp_categories) s.cat.categories = [1, 2, 3] - exp_categories = np.array([1, 2, 3]) - self.assert_numpy_array_equal(s.cat.categories, exp_categories) + exp_categories = Index([1, 2, 3]) + self.assert_index_equal(s.cat.categories, exp_categories) exp_codes = Series([0, 1, 2, 0], dtype='int8') tm.assert_series_equal(s.cat.codes, exp_codes) @@ -2022,20 +2027,20 @@ def test_series_delegations(self): # reorder s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) - exp_categories = np.array(["c", "b", "a"]) + exp_categories = Index(["c", "b", "a"]) exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_) s = s.cat.set_categories(["c", "b", "a"]) - self.assert_numpy_array_equal(s.cat.categories, exp_categories) + tm.assert_index_equal(s.cat.categories, exp_categories) self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # remove unused categories s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c" ])) - exp_categories = np.array(["a", "b"], dtype=object) + exp_categories = Index(["a", "b"]) exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_) s = s.cat.remove_unused_categories() - self.assert_numpy_array_equal(s.cat.categories, exp_categories) + self.assert_index_equal(s.cat.categories, exp_categories) self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) @@ -2082,11 +2087,11 @@ def test_assignment_to_dataframe(self): result1 = df['D'] result2 = df['E'] - self.assertTrue(result1._data._block.values.equals(d)) + self.assert_categorical_equal(result1._data._block.values, d) # sorting s.name = 'E' - self.assertTrue(result2.sort_index().equals(s.sort_index())) + self.assert_series_equal(result2.sort_index(), s.sort_index()) cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) df = pd.DataFrame(pd.Series(cat)) @@ -2885,13 +2890,17 @@ def test_value_counts(self): categories=["c", "a", "b", "d"]) s = pd.Series(cats, name='xxx') res = s.value_counts(sort=False) - exp = Series([3, 1, 2, 0], name='xxx', - index=pd.CategoricalIndex(["c", "a", "b", "d"])) + + exp_index = pd.CategoricalIndex(["c", "a", "b", "d"], + categories=cats.categories) + exp = Series([3, 1, 2, 0], name='xxx', index=exp_index) tm.assert_series_equal(res, exp) res = s.value_counts(sort=True) - exp = Series([3, 2, 1, 0], name='xxx', - index=pd.CategoricalIndex(["c", "b", "a", 
"d"])) + + exp_index = pd.CategoricalIndex(["c", "b", "a", "d"], + categories=cats.categories) + exp = Series([3, 2, 1, 0], name='xxx', index=exp_index) tm.assert_series_equal(res, exp) # check object dtype handles the Series.name as the same @@ -2927,38 +2936,39 @@ def test_value_counts_with_nan(self): index=pd.CategoricalIndex(["a", "b", np.nan]))) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical( - ["a", "b", "a"], categories=["a", "b", np.nan])) - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([2, 1, 0], - index=pd.CategoricalIndex(["a", "b", np.nan]))) + s = pd.Series(pd.Categorical(["a", "b", "a"], + categories=["a", "b", np.nan])) + + # internal categories are different because of NaN + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) + tm.assert_series_equal(s.value_counts(dropna=True), exp, + check_categorical=False) + exp = pd.Series([2, 1, 0], + index=pd.CategoricalIndex(["a", "b", np.nan])) + tm.assert_series_equal(s.value_counts(dropna=False), exp, + check_categorical=False) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical( - ["a", "b", None, "a", None, None], categories=["a", "b", np.nan - ])) - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([3, 2, 1], - index=pd.CategoricalIndex([np.nan, "a", "b"]))) + s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], + categories=["a", "b", np.nan])) + + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) + tm.assert_series_equal(s.value_counts(dropna=True), exp, + check_categorical=False) + exp = pd.Series([3, 2, 1], + index=pd.CategoricalIndex([np.nan, "a", "b"])) + tm.assert_series_equal(s.value_counts(dropna=False), exp, + check_categorical=False) def test_groupby(self): - cats = Categorical( - ["a", "a", "a", "b", "b", "b", "c", "c", "c" - ], categories=["a", "b", "c", "d"], ordered=True) + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - expected = DataFrame({'a': Series( - [1, 2, 4, np.nan], index=pd.CategoricalIndex( - ['a', 'b', 'c', 'd'], name='b'))}) + exp_index = pd.CategoricalIndex(['a', 'b', 'c', 'd'], name='b', + ordered=True) + expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b").mean() tm.assert_frame_equal(result, expected) @@ -2970,17 +2980,19 @@ def test_groupby(self): # single grouper gb = df.groupby("A") - exp_idx = pd.CategoricalIndex(['a', 'b', 'z'], name='A') + exp_idx = pd.CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # multiple groupers gb = df.groupby(['A', 'B']) - expected = DataFrame({'values': Series( - [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan - ], index=pd.MultiIndex.from_product( - [['a', 'b', 'z'], ['c', 'd', 'y']], names=['A', 'B']))}) + exp_index = pd.MultiIndex.from_product([['a', 'b', 'z'], + ['c', 'd', 'y']], + names=['A', 'B']) + expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, + np.nan, np.nan, np.nan]}, + index=exp_index) result = gb.sum() 
@@ -2970,17 +2980,19 @@
 
         # single grouper
         gb = df.groupby("A")
-        exp_idx = pd.CategoricalIndex(['a', 'b', 'z'], name='A')
+        exp_idx = pd.CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
         expected = DataFrame({'values': Series([3, 7, np.nan],
                                                index=exp_idx)})
         result = gb.sum()
         tm.assert_frame_equal(result, expected)
 
         # multiple groupers
         gb = df.groupby(['A', 'B'])
-        expected = DataFrame({'values': Series(
-            [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan
-             ], index=pd.MultiIndex.from_product(
-                 [['a', 'b', 'z'], ['c', 'd', 'y']], names=['A', 'B']))})
+        exp_index = pd.MultiIndex.from_product([['a', 'b', 'z'],
+                                                ['c', 'd', 'y']],
+                                               names=['A', 'B'])
+        expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan,
+                                         np.nan, np.nan, np.nan]},
+                             index=exp_index)
         result = gb.sum()
         tm.assert_frame_equal(result, expected)
 
@@ -3025,8 +3037,7 @@ def f(x):
         c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
         result = df.a.groupby(c).transform(sum)
-        tm.assert_series_equal(result, df['a'], check_names=False)
-        self.assertTrue(result.name is None)
+        tm.assert_series_equal(result, df['a'])
 
         tm.assert_series_equal(
             df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
@@ -3043,8 +3054,7 @@ def f(x):
         c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
         result = df.a.groupby(c).transform(sum)
-        tm.assert_series_equal(result, df['a'], check_names=False)
-        self.assertTrue(result.name is None)
+        tm.assert_series_equal(result, df['a'])
 
         tm.assert_series_equal(
             df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
@@ -3056,8 +3066,10 @@ def f(x):
         df = pd.DataFrame({'a': [1, 0, 0, 0]})
         c = pd.cut(df.a, [0, 1, 2, 3, 4])
         result = df.groupby(c).apply(len)
-        expected = pd.Series([1, 0, 0, 0],
-                             index=pd.CategoricalIndex(c.values.categories))
+
+        exp_index = pd.CategoricalIndex(c.values.categories,
+                                        ordered=c.values.ordered)
+        expected = pd.Series([1, 0, 0, 0], index=exp_index)
         expected.index.name = 'a'
         tm.assert_series_equal(result, expected)
 
@@ -3135,7 +3147,7 @@ def test_sort_values(self):
 
         res = df.sort_values(by=["sort"], ascending=False)
         exp = df.sort_values(by=["string"], ascending=True)
-        self.assert_numpy_array_equal(res["values"], exp["values"])
+        self.assert_series_equal(res["values"], exp["values"])
         self.assertEqual(res["sort"].dtype, "category")
         self.assertEqual(res["unsort"].dtype, "category")
 
@@ -3371,30 +3383,28 @@ def test_assigning_ops(self):
 
         # assign a part of a column with dtype != categorical ->
         # exp_parts_cats_col
-        cats = pd.Categorical(
-            ["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"])
+        cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"],
+                              categories=["a", "b"])
         idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"])
         values = [1, 1, 1, 1, 1, 1, 1]
         orig = pd.DataFrame({"cats": cats, "values": values}, index=idx)
 
         # the expected values
         # changed single row
-        cats1 = pd.Categorical(
-            ["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
+        cats1 = pd.Categorical(["a", "a", "b", "a", "a", "a", "a"],
+                               categories=["a", "b"])
         idx1 = pd.Index(["h", "i", "j", "k", "l", "m", "n"])
         values1 = [1, 1, 2, 1, 1, 1, 1]
-        exp_single_row = pd.DataFrame(
-            {"cats": cats1,
-             "values": values1}, index=idx1)
+        exp_single_row = pd.DataFrame({"cats": cats1,
+                                       "values": values1}, index=idx1)
 
         # changed multiple rows
-        cats2 = pd.Categorical(
-            ["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
+        cats2 = pd.Categorical(["a", "a", "b", "b", "a", "a", "a"],
+                               categories=["a", "b"])
        idx2 = pd.Index(["h", "i", "j", "k", "l", "m", "n"])
         values2 = [1, 1, 2, 2, 1, 1, 1]
-        exp_multi_row = pd.DataFrame(
-            {"cats": cats2,
-             "values": values2}, index=idx2)
+        exp_multi_row = pd.DataFrame({"cats": cats2,
+                                      "values": values2}, index=idx2)
 
         # changed part of the cats column
         cats3 = pd.Categorical(
@@ -3655,7 +3665,8 @@ def f():
         exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True)
 
         df[df["cats"] == "c"] = ["b", 2]
-        tm.assert_frame_equal(df, exp_multi_row)
+        # category c is kept in .categories
+        tm.assert_frame_equal(df, exp_fancy)
 
         # set_value
         df = orig.copy()
@@ -3710,7 +3721,7 @@ def f():
 
         # ensure that one can set something to np.nan
         s = Series(Categorical([1, 2, 3]))
-        exp = Series(Categorical([1, np.nan, 3]))
+        exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3]))
         s[1] = np.nan
         tm.assert_series_equal(s, exp)
 
@@ -3890,15 +3901,15 @@ def f():
         df1 = df[0:3]
         df2 = df[3:]
 
-        self.assert_numpy_array_equal(df['grade'].cat.categories,
-                                      df1['grade'].cat.categories)
-        self.assert_numpy_array_equal(df['grade'].cat.categories,
-                                      df2['grade'].cat.categories)
+        self.assert_index_equal(df['grade'].cat.categories,
+                                df1['grade'].cat.categories)
+        self.assert_index_equal(df['grade'].cat.categories,
+                                df2['grade'].cat.categories)
 
         dfx = pd.concat([df1, df2])
         dfx['grade'].cat.categories
-        self.assert_numpy_array_equal(df['grade'].cat.categories,
-                                      dfx['grade'].cat.categories)
+        self.assert_index_equal(df['grade'].cat.categories,
+                                dfx['grade'].cat.categories)
 
     def test_concat_preserve(self):
 
@@ -4085,10 +4096,12 @@ def f():
         c = Categorical(["a", "b", np.nan])
         with tm.assert_produces_warning(FutureWarning):
             c.set_categories(["a", "b", np.nan], rename=True, inplace=True)
+
         c[0] = np.nan
         df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
-        df_exp = pd.DataFrame({"cats": Categorical(["a", "b", "a"]),
-                               "vals": [1, 2, 3]})
+
+        cat_exp = Categorical(["a", "b", "a"], categories=["a", "b", np.nan])
+        df_exp = pd.DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
 
         res = df.fillna("a")
         tm.assert_frame_equal(res, df_exp)
 
@@ -4130,7 +4143,9 @@ def cmp(a, b):
                       ]:
 
             result = valid(s)
-            tm.assert_series_equal(result, s)
+            # compare series values
+            # internal .categories can't be compared because it is sorted
+            tm.assert_series_equal(result, s, check_categorical=False)
 
         # invalid conversion (these are NOT a dtype)
         for invalid in [lambda x: x.astype(pd.Categorical),
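# ---------------------------------------------------------------------------
# Illustration (not part of the patch): what the new check_categorical=False
# flag relaxes, assuming tm is pandas.util.testing.  Values must still
# match, but the two sides' internal `.categories` may differ (e.g. one in
# insertion order, one sorted):
#
#     import pandas as pd
#     import pandas.util.testing as tm
#
#     left = pd.Series(pd.Categorical(['b', 'a'], categories=['b', 'a']))
#     right = pd.Series(pd.Categorical(['b', 'a'], categories=['a', 'b']))
#     tm.assert_series_equal(left, right, check_categorical=False)  # passes
# ---------------------------------------------------------------------------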
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 090669681fb4f..ad43dc1c09ef1 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -695,7 +695,7 @@ def test_random_state():
                     com._random_state(state2).uniform(),
                     npr.RandomState(10).uniform())
 
     # check with no arg random state
-    assert isinstance(com._random_state(), npr.RandomState)
+    assert com._random_state() is np.random
 
     # Error for floats or strings
     with tm.assertRaises(ValueError):
@@ -817,6 +817,21 @@ def test_dict_compat():
     assert (com._dict_compat(data_unchanged) == data_unchanged)
 
 
+def test_is_timedelta():
+    assert (com.is_timedelta64_dtype('timedelta64'))
+    assert (com.is_timedelta64_dtype('timedelta64[ns]'))
+    assert (not com.is_timedelta64_ns_dtype('timedelta64'))
+    assert (com.is_timedelta64_ns_dtype('timedelta64[ns]'))
+
+    tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64')
+    assert (com.is_timedelta64_dtype(tdi))
+    assert (com.is_timedelta64_ns_dtype(tdi))
+    assert (com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]')))
+
+    # Conversion to Int64Index:
+    assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64')))
+    assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]')))
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
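# ---------------------------------------------------------------------------
# Illustration (not part of the patch): the distinction test_is_timedelta
# pins down, using the same internal helpers the test calls (com is
# pandas.core.common here).  is_timedelta64_dtype accepts any timedelta64
# resolution, while is_timedelta64_ns_dtype insists on the nanosecond unit
# pandas uses internally:
#
#     com.is_timedelta64_dtype('timedelta64[ns]')     # True
#     com.is_timedelta64_ns_dtype('timedelta64[ns]')  # True
#     com.is_timedelta64_ns_dtype('timedelta64[h]')   # False (wrong unit)
# ---------------------------------------------------------------------------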
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index 044272f24a21f..cc0972937b8a2 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -15,10 +15,10 @@
 from pandas import compat
 from pandas.util.testing import (assert_almost_equal, assert_series_equal,
                                  assert_frame_equal, assert_panel_equal,
-                                 assert_panel4d_equal)
+                                 assert_panel4d_equal, slow)
 from pandas.formats.printing import pprint_thing
 import pandas.util.testing as tm
-from numpy.testing.decorators import slow
+
 
 if not expr._USE_NUMEXPR:
     try:
@@ -287,7 +287,12 @@ def testit():
                                           use_numexpr=True)
                     expected = expr.evaluate(op, op_str, f, f,
                                              use_numexpr=False)
-                    tm.assert_numpy_array_equal(result, expected.values)
+
+                    if isinstance(result, DataFrame):
+                        tm.assert_frame_equal(result, expected)
+                    else:
+                        tm.assert_numpy_array_equal(result,
+                                                    expected.values)
 
                 result = expr._can_use_numexpr(op, op_str, f2, f2,
                                                'evaluate')
@@ -325,7 +330,10 @@ def testit():
                                           use_numexpr=True)
                     expected = expr.evaluate(op, op_str, f11, f12,
                                              use_numexpr=False)
-                    tm.assert_numpy_array_equal(result, expected.values)
+                    if isinstance(result, DataFrame):
+                        tm.assert_frame_equal(result, expected)
+                    else:
+                        tm.assert_numpy_array_equal(result, expected.values)
 
                 result = expr._can_use_numexpr(op, op_str, f21, f22,
                                                'evaluate')
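# ---------------------------------------------------------------------------
# Note: several files in this patch swap `from numpy.testing.decorators
# import slow` for the copy re-exported by pandas.util.testing, dropping
# the dependency on numpy's test decorators.  Usage is unchanged -- it just
# marks a test as slow for the runner:
#
#     from pandas.util.testing import slow
#
#     @slow
#     def test_expensive_path():
#         pass
# ---------------------------------------------------------------------------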
dtype='object') + tm.assert_index_equal(result, expected) + + result = df.describe(percentiles=[0.00499, 0.01001, 0.25, 0.50, + 0.75]).index + expected = Index(['count', 'mean', 'std', 'min', '0.5%', '1.0%', + '25%', '50%', '75%', 'max'], + dtype='object') + tm.assert_index_equal(result, expected) + + def test_describe_column_index_type(self): + # GH13288 + df = pd.DataFrame([1, 2, 3, 4]) + df.columns = pd.Index([0], dtype=object) + result = df.describe().columns + expected = Index([0], dtype=object) + tm.assert_index_equal(result, expected) + + df = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]}) + result = df.describe().columns + expected = Index([0], dtype=object) + tm.assert_index_equal(result, expected) + def test_describe_no_numeric(self): df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8, 'B': ['a', 'b', 'c', 'd'] * 6}) @@ -1001,6 +1063,16 @@ def test_describe_no_numeric(self): desc = df.describe() self.assertEqual(desc.time['first'], min(ts.index)) + def test_describe_empty(self): + df = DataFrame() + tm.assertRaisesRegexp(ValueError, 'DataFrame without columns', + df.describe) + + df = DataFrame(columns=['A', 'B']) + result = df.describe() + expected = DataFrame(0, columns=['A', 'B'], index=['count', 'unique']) + tm.assert_frame_equal(result, expected) + def test_describe_empty_int_columns(self): df = DataFrame([[0, 1], [1, 2]]) desc = df[df[0] < 0].describe() # works @@ -1280,7 +1352,7 @@ def test_tz_convert_and_localize(self): df1 = DataFrame(np.ones(5), index=l0) df1 = getattr(df1, fn)('US/Pacific') - self.assertTrue(df1.index.equals(l0_expected)) + self.assert_index_equal(df1.index, l0_expected) # MultiIndex # GH7846 @@ -1288,14 +1360,14 @@ def test_tz_convert_and_localize(self): df3 = getattr(df2, fn)('US/Pacific', level=0) self.assertFalse(df3.index.levels[0].equals(l0)) - self.assertTrue(df3.index.levels[0].equals(l0_expected)) - self.assertTrue(df3.index.levels[1].equals(l1)) + self.assert_index_equal(df3.index.levels[0], l0_expected) + self.assert_index_equal(df3.index.levels[1], l1) self.assertFalse(df3.index.levels[1].equals(l1_expected)) df3 = getattr(df2, fn)('US/Pacific', level=1) - self.assertTrue(df3.index.levels[0].equals(l0)) + self.assert_index_equal(df3.index.levels[0], l0) self.assertFalse(df3.index.levels[0].equals(l0_expected)) - self.assertTrue(df3.index.levels[1].equals(l1_expected)) + self.assert_index_equal(df3.index.levels[1], l1_expected) self.assertFalse(df3.index.levels[1].equals(l1)) df4 = DataFrame(np.ones(5), @@ -1304,9 +1376,9 @@ def test_tz_convert_and_localize(self): # TODO: untested df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa - self.assertTrue(df3.index.levels[0].equals(l0)) + self.assert_index_equal(df3.index.levels[0], l0) self.assertFalse(df3.index.levels[0].equals(l0_expected)) - self.assertTrue(df3.index.levels[1].equals(l1_expected)) + self.assert_index_equal(df3.index.levels[1], l1_expected) self.assertFalse(df3.index.levels[1].equals(l1)) # Bad Inputs @@ -1336,7 +1408,7 @@ def test_set_attribute(self): df['y'] = [2, 4, 6] df.y = 5 - assert_equal(df.y, 5) + self.assertEqual(df.y, 5) assert_series_equal(df['y'], Series([2, 4, 6], name='y')) def test_pct_change(self): @@ -1401,9 +1473,8 @@ def test_to_xarray(self): expected['f'] = expected['f'].astype(object) expected['h'] = expected['h'].astype('datetime64[ns]') expected.columns.name = None - assert_frame_equal(result.to_dataframe(), - expected, - check_index_type=False) + assert_frame_equal(result.to_dataframe(), expected, + check_index_type=False, check_categorical=False) # 
available in 0.7.1 # MultiIndex diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 3820a9d5f6476..b09185c19bffb 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -19,7 +19,7 @@ import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, - assert_is_valid_plot_return_object) + assert_is_valid_plot_return_object, slow) from pandas.core.config import set_option @@ -27,8 +27,6 @@ from numpy import random from numpy.random import rand, randn -from numpy.testing import assert_allclose -from numpy.testing.decorators import slow import pandas.tools.plotting as plotting """ These tests are for ``Dataframe.plot`` and ``Series.plot``. @@ -140,7 +138,7 @@ def _check_data(self, xp, rs): def check_line(xpl, rsl): xpdata = xpl.get_xydata() rsdata = rsl.get_xydata() - assert_allclose(xpdata, rsdata) + tm.assert_almost_equal(xpdata, rsdata) self.assertEqual(len(xp_lines), len(rs_lines)) [check_line(xpl, rsl) for xpl, rsl in zip(xp_lines, rs_lines)] @@ -708,14 +706,12 @@ def test_bar_log(self): expected = np.hstack((1.0e-04, expected, 1.0e+01)) ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar') - tm.assert_numpy_array_equal(ax.get_ylim(), - (0.001, 0.10000000000000001)) + self.assertEqual(ax.get_ylim(), (0.001, 0.10000000000000001)) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) tm.close() ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh') - tm.assert_numpy_array_equal(ax.get_xlim(), - (0.001, 0.10000000000000001)) + self.assertEqual(ax.get_xlim(), (0.001, 0.10000000000000001)) tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) @slow @@ -2207,11 +2203,11 @@ def test_scatter_colors(self): ax = df.plot.scatter(x='a', y='b', c='c') tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], - (0, 0, 1, 1)) + np.array([0, 0, 1, 1], dtype=np.float64)) ax = df.plot.scatter(x='a', y='b', color='white') tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], - (1, 1, 1, 1)) + np.array([1, 1, 1, 1], dtype=np.float64)) @slow def test_plot_bar(self): diff --git a/pandas/tests/test_graphics_others.py b/pandas/tests/test_graphics_others.py index b032ce196c113..7285d84865542 100644 --- a/pandas/tests/test_graphics_others.py +++ b/pandas/tests/test_graphics_others.py @@ -11,12 +11,12 @@ from pandas import Series, DataFrame, MultiIndex from pandas.compat import range, lmap, lzip import pandas.util.testing as tm +from pandas.util.testing import slow import numpy as np from numpy import random from numpy.random import randn -from numpy.testing.decorators import slow import pandas.tools.plotting as plotting from pandas.tests.test_graphics import (TestPlotBase, _check_plot_works, diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 5bd5c80f18386..6659e6b106a67 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -8,6 +8,7 @@ from pandas import date_range, bdate_range, Timestamp from pandas.core.index import Index, MultiIndex, CategoricalIndex from pandas.core.api import Categorical, DataFrame +from pandas.core.common import UnsupportedFunctionCall from pandas.core.groupby import (SpecificationError, DataError, _nargsort, _lexsort_indexer) from pandas.core.series import Series @@ -30,7 +31,6 @@ import pandas.util.testing as tm import pandas as pd -from numpy.testing import assert_equal class TestGroupBy(tm.TestCase): @@ -774,11 +774,11 @@ def test_agg_apply_corner(self): # DataFrame grouped = 
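Note: the plotting files above repeatedly swap the `slow` decorator's source from `numpy.testing.decorators` to `pandas.util.testing`. A minimal sketch of the new import style (illustrative only, not part of the patch; the test body is a placeholder):

    from pandas.util.testing import slow

    @slow
    def test_expensive_plot_roundtrip():
        # decorated tests are skipped in "fast" test runs; the decorator
        # now ships with pandas instead of numpy.testing.decorators
        pass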
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 5bd5c80f18386..6659e6b106a67 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -8,6 +8,7 @@
 from pandas import date_range, bdate_range, Timestamp
 from pandas.core.index import Index, MultiIndex, CategoricalIndex
 from pandas.core.api import Categorical, DataFrame
+from pandas.core.common import UnsupportedFunctionCall
 from pandas.core.groupby import (SpecificationError, DataError, _nargsort,
                                  _lexsort_indexer)
 from pandas.core.series import Series
@@ -30,7 +31,6 @@
 import pandas.util.testing as tm

 import pandas as pd
-from numpy.testing import assert_equal


 class TestGroupBy(tm.TestCase):
@@ -774,11 +774,11 @@ def test_agg_apply_corner(self):

         # DataFrame
         grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
         exp_df = DataFrame(columns=self.tsframe.columns, dtype=float,
-                           index=pd.Index(
-                               [], dtype=np.float64))
+                           index=pd.Index([], dtype=np.float64))
         assert_frame_equal(grouped.sum(), exp_df, check_names=False)
         assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
-        assert_frame_equal(grouped.apply(np.sum), DataFrame({}, dtype=float))
+        assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
+                           check_names=False)

     def test_agg_grouping_is_list_tuple(self):
         from pandas.core.groupby import Grouping
@@ -1051,24 +1051,50 @@ def test_transform_fast(self):

         values = np.repeat(grp.mean().values,
                            com._ensure_platform_int(grp.count().values))
-        expected = pd.Series(values, index=df.index)
+        expected = pd.Series(values, index=df.index, name='val')

         result = grp.transform(np.mean)
         assert_series_equal(result, expected)

         result = grp.transform('mean')
         assert_series_equal(result, expected)

+        # GH 12737
+        df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
+                           'd': pd.date_range('2014-1-1', '2014-1-4'),
+                           'i': [1, 2, 3, 4]},
+                          columns=['grouping', 'f', 'i', 'd'])
+        result = df.groupby('grouping').transform('first')
+
+        dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
+                 pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
+        expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
+                                 'd': dates,
+                                 'i': [1, 2, 2, 4]},
+                                columns=['f', 'i', 'd'])
+        assert_frame_equal(result, expected)
+
+        # selection
+        result = df.groupby('grouping')[['f', 'i']].transform('first')
+        expected = expected[['f', 'i']]
+        assert_frame_equal(result, expected)
+
+        # dup columns
+        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
+        result = df.groupby('g').transform('first')
+        expected = df.drop('g', axis=1)
+        assert_frame_equal(result, expected)
+
     def test_transform_broadcast(self):
         grouped = self.ts.groupby(lambda x: x.month)
         result = grouped.transform(np.mean)
-        self.assertTrue(result.index.equals(self.ts.index))
+        self.assert_index_equal(result.index, self.ts.index)

         for _, gp in grouped:
             assert_fp_equal(result.reindex(gp.index), gp.mean())

         grouped = self.tsframe.groupby(lambda x: x.month)
         result = grouped.transform(np.mean)
-        self.assertTrue(result.index.equals(self.tsframe.index))
+        self.assert_index_equal(result.index, self.tsframe.index)
         for _, gp in grouped:
             agged = gp.mean()
             res = result.reindex(gp.index)
@@ -1079,8 +1105,8 @@ def test_transform_broadcast(self):
         grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                        axis=1)
         result = grouped.transform(np.mean)
-        self.assertTrue(result.index.equals(self.tsframe.index))
-        self.assertTrue(result.columns.equals(self.tsframe.columns))
+        self.assert_index_equal(result.index, self.tsframe.index)
+        self.assert_index_equal(result.columns, self.tsframe.columns)
         for _, gp in grouped:
             agged = gp.mean(1)
             res = result.reindex(columns=gp.columns)
@@ -1191,6 +1217,16 @@ def test_transform_function_aliases(self):
         expected = self.df.groupby('A')['C'].transform(np.mean)
         assert_series_equal(result, expected)

+    def test_series_fast_transform_date(self):
+        # GH 13191
+        df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
+                           'd': pd.date_range('2014-1-1', '2014-1-4')})
+        result = df.groupby('grouping')['d'].transform('first')
+        dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
+                 pd.Timestamp('2014-1-4')]
+        expected = pd.Series(dates, name='d')
+        assert_series_equal(result, expected)
+
     def test_transform_length(self):
         # GH 9697
         df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
@@ -2101,7 +2137,7 @@ def test_groupby_multiple_key(self):
                              lambda x: x.day], axis=1)

         agged = grouped.agg(lambda x: x.sum())
-        self.assertTrue(agged.index.equals(df.columns))
+        self.assert_index_equal(agged.index, df.columns)
         assert_almost_equal(df.T.values, agged.values)

         agged = grouped.agg(lambda x: x.sum())
@@ -2513,7 +2549,7 @@ def f(piece):

         result = grouped.apply(f)
         tm.assertIsInstance(result, DataFrame)
-        self.assertTrue(result.index.equals(ts.index))
+        self.assert_index_equal(result.index, ts.index)

     def test_apply_series_yield_constant(self):
         result = self.df.groupby(['A', 'B'])['C'].apply(len)
@@ -2523,7 +2559,7 @@ def test_apply_frame_to_series(self):
         grouped = self.df.groupby(['A', 'B'])
         result = grouped.apply(len)
         expected = grouped.count()['C']
-        self.assertTrue(result.index.equals(expected.index))
+        self.assert_index_equal(result.index, expected.index)
         self.assert_numpy_array_equal(result.values, expected.values)

     def test_apply_frame_concat_series(self):
@@ -2637,26 +2673,26 @@ def test_groupby_with_hier_columns(self):
         df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

         result = df.groupby(level=0).mean()
-        self.assertTrue(result.columns.equals(columns))
+        self.assert_index_equal(result.columns, columns)

         result = df.groupby(level=0, axis=1).mean()
-        self.assertTrue(result.index.equals(df.index))
+        self.assert_index_equal(result.index, df.index)

         result = df.groupby(level=0).agg(np.mean)
-        self.assertTrue(result.columns.equals(columns))
+        self.assert_index_equal(result.columns, columns)

         result = df.groupby(level=0).apply(lambda x: x.mean())
-        self.assertTrue(result.columns.equals(columns))
+        self.assert_index_equal(result.columns, columns)

         result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
-        self.assertTrue(result.columns.equals(Index(['A', 'B'])))
-        self.assertTrue(result.index.equals(df.index))
+        self.assert_index_equal(result.columns, Index(['A', 'B']))
+        self.assert_index_equal(result.index, df.index)

         # add a nuisance column
         sorted_columns, _ = columns.sortlevel(0)
         df['A', 'foo'] = 'bar'
         result = df.groupby(level=0).mean()
-        self.assertTrue(result.columns.equals(df.columns[:-1]))
+        self.assert_index_equal(result.columns, df.columns[:-1])

     def test_pass_args_kwargs(self):
         from numpy import percentile
@@ -2676,7 +2712,7 @@ def f(x, q=None, axis=0):
         trans_expected = ts_grouped.transform(g)

         assert_series_equal(apply_result, agg_expected)
-        assert_series_equal(agg_result, agg_expected)
+        assert_series_equal(agg_result, agg_expected, check_names=False)
         assert_series_equal(trans_result, trans_expected)

         agg_result = ts_grouped.agg(f, q=80)
@@ -2692,11 +2728,11 @@ def f(x, q=None, axis=0):
         apply_result = df_grouped.apply(DataFrame.quantile, .8)
         expected = df_grouped.quantile(.8)
         assert_frame_equal(apply_result, expected)
-        assert_frame_equal(agg_result, expected)
+        assert_frame_equal(agg_result, expected, check_names=False)

         agg_result = df_grouped.agg(f, q=80)
         apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
-        assert_frame_equal(agg_result, expected)
+        assert_frame_equal(agg_result, expected, check_names=False)
         assert_frame_equal(apply_result, expected)

     def test_size(self):
@@ -3377,18 +3413,18 @@ def test_panel_groupby(self):

         tm.assert_panel_equal(agged, agged2)

-        self.assert_numpy_array_equal(agged.items, [0, 1])
+        self.assert_index_equal(agged.items, Index([0, 1]))

         grouped = self.panel.groupby(lambda x: x.month, axis='major')
         agged = grouped.mean()

-        self.assert_numpy_array_equal(agged.major_axis, sorted(list(set(
-            self.panel.major_axis.month))))
+        exp = Index(sorted(list(set(self.panel.major_axis.month))))
+        self.assert_index_equal(agged.major_axis, exp)

         grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                      axis='minor')
         agged = grouped.mean()
-        self.assert_numpy_array_equal(agged.minor_axis, [0, 1])
+        self.assert_index_equal(agged.minor_axis, Index([0, 1]))

     def test_numpy_groupby(self):
         from pandas.core.groupby import numpy_groupby
@@ -3414,7 +3450,7 @@ def test_groupby_2d_malformed(self):
         d['label'] = ['l1', 'l2']
         tmp = d.groupby(['group']).mean()
         res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
-        self.assert_numpy_array_equal(tmp.columns, ['zeros', 'ones'])
+        self.assert_index_equal(tmp.columns, Index(['zeros', 'ones']))
         self.assert_numpy_array_equal(tmp.values, res_values)

     def test_int32_overflow(self):
@@ -3453,10 +3489,10 @@ def test_int64_overflow(self):
         right = rg.sum()['values']

         exp_index, _ = left.index.sortlevel(0)
-        self.assertTrue(left.index.equals(exp_index))
+        self.assert_index_equal(left.index, exp_index)

         exp_index, _ = right.index.sortlevel(0)
-        self.assertTrue(right.index.equals(exp_index))
+        self.assert_index_equal(right.index, exp_index)

         tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'
                                    ]].values))
@@ -3684,9 +3720,9 @@ def test_agg_multiple_functions_maintain_order(self):
        # GH #610
        funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)]
        result = self.df.groupby('A')['C'].agg(funcs)
-        exp_cols = ['mean', 'max', 'min']
+        exp_cols = Index(['mean', 'max', 'min'])

-        self.assert_numpy_array_equal(result.columns, exp_cols)
+        self.assert_index_equal(result.columns, exp_cols)

     def test_multiple_functions_tuples_and_non_tuples(self):
         # #1359
@@ -3831,8 +3867,8 @@ def test_groupby_sort_categorical(self):
                         ['(0, 2.5]', 1, 60], ['(5, 7.5]', 7, 70]],
                        columns=['range', 'foo', 'bar'])
         df['range'] = Categorical(df['range'], ordered=True)
-        index = CategoricalIndex(
-            ['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range')
+        index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+                                  '(7.5, 10]'], name='range', ordered=True)
         result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                                 columns=['foo', 'bar'], index=index)
@@ -3842,13 +3878,15 @@ def test_groupby_sort_categorical(self):
         assert_frame_equal(result_sort, df.groupby(col, sort=False).first())

         df['range'] = Categorical(df['range'], ordered=False)
-        index = CategoricalIndex(
-            ['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range')
+        index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+                                  '(7.5, 10]'], name='range')
         result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                                 columns=['foo', 'bar'], index=index)

-        index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]',
-                                  '(5, 7.5]', '(0, 2.5]'],
+        index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
+                                  '(0, 2.5]'],
+                                 categories=['(7.5, 10]', '(2.5, 5]',
+                                             '(5, 7.5]', '(0, 2.5]'],
                                  name='range')
         result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                   index=index, columns=['foo', 'bar'])
@@ -3938,7 +3976,8 @@ def test_groupby_categorical(self):
         result = data.groupby(cats).mean()

         expected = data.groupby(np.asarray(cats)).mean()
-        exp_idx = CategoricalIndex(levels, ordered=True)
+        exp_idx = CategoricalIndex(levels, categories=cats.categories,
+                                   ordered=True)
         expected = expected.reindex(exp_idx)

         assert_frame_equal(result, expected)
@@ -3949,14 +3988,16 @@ def test_groupby_categorical(self):
         idx = cats.codes.argsort()
         ord_labels = np.asarray(cats).take(idx)
         ord_data = data.take(idx)
-        expected = ord_data.groupby(
-            Categorical(ord_labels), sort=False).describe()
+
+        exp_cats = Categorical(ord_labels, ordered=True,
+                               categories=['foo', 'bar', 'baz', 'qux'])
+        expected = ord_data.groupby(exp_cats, sort=False).describe()
         expected.index.names = [None, None]
         assert_frame_equal(desc_result, expected)

         # GH 10460
-        expc = Categorical.from_codes(
-            np.arange(4).repeat(8), levels, ordered=True)
+        expc = Categorical.from_codes(np.arange(4).repeat(8),
+                                      levels, ordered=True)
         exp = CategoricalIndex(expc)
         self.assert_index_equal(desc_result.index.get_level_values(0), exp)
         exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
@@ -4234,10 +4275,10 @@ def test_multiindex_columns_empty_level(self):
         df = DataFrame([[long(1), 'A']], columns=midx)

         grouped = df.groupby('to filter').groups
-        self.assert_numpy_array_equal(grouped['A'], [0])
+        self.assertEqual(grouped['A'], [0])

         grouped = df.groupby([('to filter', '')]).groups
-        self.assert_numpy_array_equal(grouped['A'], [0])
+        self.assertEqual(grouped['A'], [0])

         df = DataFrame([[long(1), 'A'], [long(2), 'B']], columns=midx)
@@ -4406,7 +4447,7 @@ def test_groupby_datetime64_32_bit(self):

         df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2})
         result = df.groupby("A")["B"].transform(min)
-        expected = Series([pd.Timestamp('2000-01-1')] * 2)
+        expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B')
         assert_series_equal(result, expected)

     def test_groupby_categorical_unequal_len(self):
@@ -4508,7 +4549,7 @@ def test_groupby_with_empty(self):
         grouped = series.groupby(grouper)
         assert next(iter(grouped), None) is None

-    def test_aaa_groupby_with_small_elem(self):
+    def test_groupby_with_small_elem(self):
         # GH 8542
         # length=2
         df = pd.DataFrame({'event': ['start', 'start'],
@@ -4579,10 +4620,10 @@ def test_timezone_info(self):
         import pytz

         df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]})
-        tm.assert_equal(df['b'][0].tzinfo, pytz.utc)
+        self.assertEqual(df['b'][0].tzinfo, pytz.utc)
         df = pd.DataFrame({'a': [1, 2, 3]})
         df['b'] = datetime.now(pytz.utc)
-        tm.assert_equal(df['b'][0].tzinfo, pytz.utc)
+        self.assertEqual(df['b'][0].tzinfo, pytz.utc)

     def test_groupby_with_timegrouper(self):
         # GH 4161
@@ -5812,25 +5853,23 @@ def test_lexsort_indexer(self):
         keys = [[nan] * 5 + list(range(100)) + [nan] * 5]
         # orders=True, na_position='last'
         result = _lexsort_indexer(keys, orders=True, na_position='last')
-        expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
-        assert_equal(result, expected)
+        exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp))

         # orders=True, na_position='first'
         result = _lexsort_indexer(keys, orders=True, na_position='first')
-        expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
-        assert_equal(result, expected)
+        exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
+        tm.assert_numpy_array_equal(result, np.array(exp))

         # orders=False, na_position='last'
         result = _lexsort_indexer(keys, orders=False, na_position='last')
-        expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105,
-                                                                         110))
-        assert_equal(result, expected)
+        exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp))

         # orders=False, na_position='first'
         result = _lexsort_indexer(keys, orders=False, na_position='first')
-        expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4,
-                                                                       -1))
-        assert_equal(result, expected)
+        exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
+        tm.assert_numpy_array_equal(result, np.array(exp))

     def test_nargsort(self):
         # np.argsort(items) places NaNs last
@@ -5856,54 +5895,50 @@ def test_nargsort(self):
         # mergesort, ascending=True, na_position='last'
         result = _nargsort(items, kind='mergesort', ascending=True,
                            na_position='last')
-        expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
-        assert_equal(result, expected)
+        exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64))

         # mergesort, ascending=True, na_position='first'
         result = _nargsort(items, kind='mergesort', ascending=True,
                            na_position='first')
-        expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
-        assert_equal(result, expected)
+        exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64))

         # mergesort, ascending=False, na_position='last'
         result = _nargsort(items, kind='mergesort', ascending=False,
                            na_position='last')
-        expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105,
-                                                                         110))
-        assert_equal(result, expected)
+        exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64))

         # mergesort, ascending=False, na_position='first'
         result = _nargsort(items, kind='mergesort', ascending=False,
                            na_position='first')
-        expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4,
-                                                                       -1))
-        assert_equal(result, expected)
+        exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64))

         # mergesort, ascending=True, na_position='last'
         result = _nargsort(items2, kind='mergesort', ascending=True,
                            na_position='last')
-        expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
-        assert_equal(result, expected)
+        exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64))

         # mergesort, ascending=True, na_position='first'
         result = _nargsort(items2, kind='mergesort', ascending=True,
                            na_position='first')
-        expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
-        assert_equal(result, expected)
+        exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64))

         # mergesort, ascending=False, na_position='last'
         result = _nargsort(items2, kind='mergesort', ascending=False,
                            na_position='last')
-        expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105,
-                                                                         110))
-        assert_equal(result, expected)
+        exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64))

         # mergesort, ascending=False, na_position='first'
         result = _nargsort(items2, kind='mergesort', ascending=False,
                            na_position='first')
-        expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4,
-                                                                       -1))
-        assert_equal(result, expected)
+        exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1))
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64))

     def test_datetime_count(self):
         df = DataFrame({'a': [1, 2, 3] * 2,
@@ -5972,7 +6007,7 @@ def test__cython_agg_general(self):
                 exc.args += ('operation: %s' % op, )
                 raise

-    def test_aa_cython_group_transform_algos(self):
+    def test_cython_group_transform_algos(self):
         # GH 4095
         dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8,
                   np.uint32, np.uint64, np.float32, np.float64]
@@ -6229,8 +6264,11 @@ def test_groupby_categorical_two_columns(self):
         # Grouping on a single column
         groups_single_key = test.groupby("cat")
         res = groups_single_key.agg('mean')
+
+        exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat",
+                                        ordered=True)
         exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]},
-                        index=pd.CategoricalIndex(["a", "b", "c"], name="cat"))
+                        index=exp_index)
         tm.assert_frame_equal(res, exp)

         # Grouping on two columns
@@ -6279,6 +6317,29 @@ def test_func(x):
         expected = DataFrame()
         tm.assert_frame_equal(result, expected)

+    def test_groupby_apply_none_first(self):
+        # GH 12824. Tests if apply returns None first.
+        test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]})
+        test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]})
+
+        def test_func(x):
+            if x.shape[0] < 2:
+                return None
+            return x.iloc[[0, -1]]
+
+        result1 = test_df1.groupby('groups').apply(test_func)
+        result2 = test_df2.groupby('groups').apply(test_func)
+        index1 = MultiIndex.from_arrays([[1, 1], [0, 2]],
+                                        names=['groups', None])
+        index2 = MultiIndex.from_arrays([[2, 2], [1, 3]],
+                                        names=['groups', None])
+        expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]},
+                              index=index1)
+        expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]},
+                              index=index2)
+        tm.assert_frame_equal(result1, expected1)
+        tm.assert_frame_equal(result2, expected2)
+
     def test_first_last_max_min_on_time_data(self):
         # GH 10295
         # Verify that NaT is not in the result of max, min, first and last on
@@ -6357,6 +6418,19 @@ def test_transform_with_non_scalar_group(self):
                               (axis=1, level=1).transform,
                               lambda z: z.div(z.sum(axis=1), axis=0))

+    def test_numpy_compat(self):
+        # see gh-12811
+        df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
+        g = df.groupby('A')
+
+        msg = "numpy operations are not valid with groupby"
+
+        for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'):
+            tm.assertRaisesRegexp(UnsupportedFunctionCall, msg,
+                                  getattr(g, func), 1, 2, 3)
+            tm.assertRaisesRegexp(UnsupportedFunctionCall, msg,
+                                  getattr(g, func), foo=1)
+

 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()
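For context on the `UnsupportedFunctionCall` guard that `test_numpy_compat` exercises above: after this change, numpy-style positional or keyword arguments to cythonized groupby reductions fail fast instead of being silently ignored. A minimal sketch (illustrative, not part of the patch; the DataFrame is made up):

    import pandas as pd
    from pandas.core.common import UnsupportedFunctionCall

    df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
    g = df.groupby('A')

    g.mean()              # fine: no extra arguments
    try:
        g.mean(1, 2, 3)   # numpy-style positional args are rejected
    except UnsupportedFunctionCall as err:
        print(err)        # "numpy operations are not valid with groupby..."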
diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py
new file mode 100644
index 0000000000000..a6941369b35be
--- /dev/null
+++ b/pandas/tests/test_infer_and_convert.py
@@ -0,0 +1,444 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime, timedelta, date, time
+
+import numpy as np
+import pandas as pd
+import pandas.lib as lib
+import pandas.util.testing as tm
+from pandas import Index
+
+from pandas.compat import long, u, PY2
+
+
+class TestInference(tm.TestCase):
+
+    def test_infer_dtype_bytes(self):
+        compare = 'string' if PY2 else 'bytes'
+
+        # string array of bytes
+        arr = np.array(list('abc'), dtype='S1')
+        self.assertEqual(pd.lib.infer_dtype(arr), compare)
+
+        # object array of bytes
+        arr = arr.astype(object)
+        self.assertEqual(pd.lib.infer_dtype(arr), compare)
+
+    def test_isinf_scalar(self):
+        # GH 11352
+        self.assertTrue(lib.isposinf_scalar(float('inf')))
+        self.assertTrue(lib.isposinf_scalar(np.inf))
+        self.assertFalse(lib.isposinf_scalar(-np.inf))
+        self.assertFalse(lib.isposinf_scalar(1))
+        self.assertFalse(lib.isposinf_scalar('a'))
+
+        self.assertTrue(lib.isneginf_scalar(float('-inf')))
+        self.assertTrue(lib.isneginf_scalar(-np.inf))
+        self.assertFalse(lib.isneginf_scalar(np.inf))
+        self.assertFalse(lib.isneginf_scalar(1))
+        self.assertFalse(lib.isneginf_scalar('a'))
+
+    def test_maybe_convert_numeric_infinities(self):
+        # see gh-13274
+        infinities = ['inf', 'inF', 'iNf', 'Inf',
+                      'iNF', 'InF', 'INf', 'INF']
+        na_values = set(['', 'NULL', 'nan'])
+
+        pos = np.array(['inf'], dtype=np.float64)
+        neg = np.array(['-inf'], dtype=np.float64)
+
+        msg = "Unable to parse string"
+
+        for infinity in infinities:
+            for maybe_int in (True, False):
+                out = lib.maybe_convert_numeric(
+                    np.array([infinity], dtype=object),
+                    na_values, maybe_int)
+                tm.assert_numpy_array_equal(out, pos)
+
+                out = lib.maybe_convert_numeric(
+                    np.array(['-' + infinity], dtype=object),
+                    na_values, maybe_int)
+                tm.assert_numpy_array_equal(out, neg)
+
+                out = lib.maybe_convert_numeric(
+                    np.array([u(infinity)], dtype=object),
+                    na_values, maybe_int)
+                tm.assert_numpy_array_equal(out, pos)
+
+                out = lib.maybe_convert_numeric(
+                    np.array(['+' + infinity], dtype=object),
+                    na_values, maybe_int)
+                tm.assert_numpy_array_equal(out, pos)
+
+                # too many characters
+                with tm.assertRaisesRegexp(ValueError, msg):
+                    lib.maybe_convert_numeric(
+                        np.array(['foo_' + infinity], dtype=object),
+                        na_values, maybe_int)
+
+    def test_maybe_convert_numeric_post_floatify_nan(self):
+        # see gh-13314
+        data = np.array(['1.200', '-999.000', '4.500'], dtype=object)
+        expected = np.array([1.2, np.nan, 4.5], dtype=np.float64)
+        nan_values = set([-999, -999.0])
+
+        for coerce_type in (True, False):
+            out = lib.maybe_convert_numeric(data, nan_values, coerce_type)
+            tm.assert_numpy_array_equal(out, expected)
+
+    def test_convert_infs(self):
+        arr = np.array(['inf', 'inf', 'inf'], dtype='O')
+        result = lib.maybe_convert_numeric(arr, set(), False)
+        self.assertTrue(result.dtype == np.float64)
+
+        arr = np.array(['-inf', '-inf', '-inf'], dtype='O')
+        result = lib.maybe_convert_numeric(arr, set(), False)
+        self.assertTrue(result.dtype == np.float64)
+
+    def test_scientific_no_exponent(self):
+        # See PR 12215
+        arr = np.array(['42E', '2E', '99e', '6e'], dtype='O')
+        result = lib.maybe_convert_numeric(arr, set(), False, True)
+        self.assertTrue(np.all(np.isnan(result)))
+
+    def test_convert_non_hashable(self):
+        # GH13324
+        # make sure that we are handling non-hashables
+        arr = np.array([[10.0, 2], 1.0, 'apple'])
+        result = lib.maybe_convert_numeric(arr, set(), False, True)
+        tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan]))
+
+
+class TestTypeInference(tm.TestCase):
+    _multiprocess_can_split_ = True
+
+    def test_length_zero(self):
+        result = lib.infer_dtype(np.array([], dtype='i4'))
+        self.assertEqual(result, 'integer')
+
+        result = lib.infer_dtype([])
+        self.assertEqual(result, 'empty')
+
+    def test_integers(self):
+        arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'integer')
+
+        arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'mixed-integer')
+
+        arr = np.array([1, 2, 3, 4, 5], dtype='i4')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'integer')
+
+    def test_bools(self):
+        arr = np.array([True, False, True, True, True], dtype='O')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'boolean')
+
+        arr = np.array([np.bool_(True), np.bool_(False)], dtype='O')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'boolean')
+
+        arr = np.array([True, False, True, 'foo'], dtype='O')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'mixed')
+
+        arr = np.array([True, False, True], dtype=bool)
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'boolean')
+
+    def test_floats(self):
+        arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'floating')
+
+        arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'],
+                       dtype='O')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'mixed-integer')
+
+        arr = np.array([1, 2, 3, 4, 5], dtype='f4')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'floating')
+
+        arr = np.array([1, 2, 3, 4, 5], dtype='f8')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'floating')
+
+    def test_string(self):
+        pass
+
+    def test_unicode(self):
+        pass
+
+    def test_datetime(self):
+
+        dates = [datetime(2012, 1, x) for x in range(1, 20)]
+        index = Index(dates)
+        self.assertEqual(index.inferred_type, 'datetime64')
+
+    def test_date(self):
+
+        dates = [date(2012, 1, x) for x in range(1, 20)]
+        index = Index(dates)
+        self.assertEqual(index.inferred_type, 'date')
+
+    def test_to_object_array_tuples(self):
+        r = (5, 6)
+        values = [r]
+        result = lib.to_object_array_tuples(values)
+
+        try:
+            # make sure record array works
+            from collections import namedtuple
+            record = namedtuple('record', 'x y')
+            r = record(5, 6)
+            values = [r]
+            result = lib.to_object_array_tuples(values)  # noqa
+        except ImportError:
+            pass
+
+    def test_to_object_array_width(self):
+        # see gh-13320
+        rows = [[1, 2, 3], [4, 5, 6]]
+
+        expected = np.array(rows, dtype=object)
+        out = lib.to_object_array(rows)
+        tm.assert_numpy_array_equal(out, expected)
+
+        expected = np.array(rows, dtype=object)
+        out = lib.to_object_array(rows, min_width=1)
+        tm.assert_numpy_array_equal(out, expected)
+
+        expected = np.array([[1, 2, 3, None, None],
+                             [4, 5, 6, None, None]], dtype=object)
+        out = lib.to_object_array(rows, min_width=5)
+        tm.assert_numpy_array_equal(out, expected)
+
+    def test_object(self):
+
+        # GH 7431
+        # cannot infer more than this as only a single element
+        arr = np.array([None], dtype='O')
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'mixed')
+
+    def test_categorical(self):
+
+        # GH 8974
+        from pandas import Categorical, Series
+        arr = Categorical(list('abc'))
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'categorical')
+
+        result = lib.infer_dtype(Series(arr))
+        self.assertEqual(result, 'categorical')
+
+        arr = Categorical(list('abc'), categories=['cegfab'], ordered=True)
+        result = lib.infer_dtype(arr)
+        self.assertEqual(result, 'categorical')
+
+        result = lib.infer_dtype(Series(arr))
+        self.assertEqual(result, 'categorical')
+
+
+class TestConvert(tm.TestCase):
+
+    def test_convert_objects(self):
+        arr = np.array(['a', 'b', np.nan, np.nan, 'd', 'e', 'f'], dtype='O')
+        result = lib.maybe_convert_objects(arr)
+        self.assertTrue(result.dtype == np.object_)
+
+    def test_convert_objects_ints(self):
+        # test that we can detect many kinds of integers
+        dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
+
+        for dtype_str in dtypes:
+            arr = np.array(list(np.arange(20, dtype=dtype_str)), dtype='O')
+            self.assertTrue(arr[0].dtype == np.dtype(dtype_str))
+            result = lib.maybe_convert_objects(arr)
+            self.assertTrue(issubclass(result.dtype.type, np.integer))
+
+    def test_convert_objects_complex_number(self):
+        for dtype in np.sctypes['complex']:
+            arr = np.array(list(1j * np.arange(20, dtype=dtype)), dtype='O')
+            self.assertTrue(arr[0].dtype == np.dtype(dtype))
+            result = lib.maybe_convert_objects(arr)
+            self.assertTrue(issubclass(result.dtype.type, np.complexfloating))
+
+
+class Testisscalar(tm.TestCase):
+
+    def test_isscalar_builtin_scalars(self):
+        self.assertTrue(lib.isscalar(None))
+        self.assertTrue(lib.isscalar(True))
+        self.assertTrue(lib.isscalar(False))
+        self.assertTrue(lib.isscalar(0.))
+        self.assertTrue(lib.isscalar(np.nan))
+        self.assertTrue(lib.isscalar('foobar'))
+        self.assertTrue(lib.isscalar(b'foobar'))
+        self.assertTrue(lib.isscalar(u('efoobar')))
+        self.assertTrue(lib.isscalar(datetime(2014, 1, 1)))
+        self.assertTrue(lib.isscalar(date(2014, 1, 1)))
+        self.assertTrue(lib.isscalar(time(12, 0)))
+        self.assertTrue(lib.isscalar(timedelta(hours=1)))
+        self.assertTrue(lib.isscalar(pd.NaT))
+
+    def test_isscalar_builtin_nonscalars(self):
+        self.assertFalse(lib.isscalar({}))
+        self.assertFalse(lib.isscalar([]))
+        self.assertFalse(lib.isscalar([1]))
+        self.assertFalse(lib.isscalar(()))
+        self.assertFalse(lib.isscalar((1, )))
+        self.assertFalse(lib.isscalar(slice(None)))
+        self.assertFalse(lib.isscalar(Ellipsis))
+
+    def test_isscalar_numpy_array_scalars(self):
+        self.assertTrue(lib.isscalar(np.int64(1)))
+        self.assertTrue(lib.isscalar(np.float64(1.)))
+        self.assertTrue(lib.isscalar(np.int32(1)))
+        self.assertTrue(lib.isscalar(np.object_('foobar')))
+        self.assertTrue(lib.isscalar(np.str_('foobar')))
+        self.assertTrue(lib.isscalar(np.unicode_(u('foobar'))))
+        self.assertTrue(lib.isscalar(np.bytes_(b'foobar')))
+        self.assertTrue(lib.isscalar(np.datetime64('2014-01-01')))
+        self.assertTrue(lib.isscalar(np.timedelta64(1, 'h')))
+
+    def test_isscalar_numpy_zerodim_arrays(self):
+        for zerodim in [np.array(1), np.array('foobar'),
+                        np.array(np.datetime64('2014-01-01')),
+                        np.array(np.timedelta64(1, 'h')),
+                        np.array(np.datetime64('NaT'))]:
+            self.assertFalse(lib.isscalar(zerodim))
+            self.assertTrue(lib.isscalar(lib.item_from_zerodim(zerodim)))
+
+    def test_isscalar_numpy_arrays(self):
+        self.assertFalse(lib.isscalar(np.array([])))
+        self.assertFalse(lib.isscalar(np.array([[]])))
+        self.assertFalse(lib.isscalar(np.matrix('1; 2')))
+
+    def test_isscalar_pandas_scalars(self):
+        self.assertTrue(lib.isscalar(pd.Timestamp('2014-01-01')))
+        self.assertTrue(lib.isscalar(pd.Timedelta(hours=1)))
+        self.assertTrue(lib.isscalar(pd.Period('2014-01-01')))
+
+    def test_lisscalar_pandas_containers(self):
+        self.assertFalse(lib.isscalar(pd.Series()))
+        self.assertFalse(lib.isscalar(pd.Series([1])))
+        self.assertFalse(lib.isscalar(pd.DataFrame()))
+        self.assertFalse(lib.isscalar(pd.DataFrame([[1]])))
+        self.assertFalse(lib.isscalar(pd.Panel()))
+        self.assertFalse(lib.isscalar(pd.Panel([[[1]]])))
+        self.assertFalse(lib.isscalar(pd.Index([])))
+        self.assertFalse(lib.isscalar(pd.Index([1])))
+
+
+class TestParseSQL(tm.TestCase):
+
+    def test_convert_sql_column_floats(self):
+        arr = np.array([1.5, None, 3, 4.2], dtype=object)
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
+        self.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_strings(self):
+        arr = np.array(['1.5', None, '3', '4.2'], dtype=object)
+        result = lib.convert_sql_column(arr)
+        expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object)
+        self.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_unicode(self):
+        arr = np.array([u('1.5'), None, u('3'), u('4.2')],
+                       dtype=object)
+        result = lib.convert_sql_column(arr)
+        expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')],
+                            dtype=object)
+        self.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_ints(self):
+        arr = np.array([1, 2, 3, 4], dtype='O')
+        arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O')
+        result = lib.convert_sql_column(arr)
+        result2 = lib.convert_sql_column(arr2)
+        expected = np.array([1, 2, 3, 4], dtype='i8')
+        self.assert_numpy_array_equal(result, expected)
+        self.assert_numpy_array_equal(result2, expected)
+
+        arr = np.array([1, 2, 3, None, 4], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1, 2, 3, np.nan, 4], dtype='f8')
+        self.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_longs(self):
+        arr = np.array([long(1), long(2), long(3), long(4)], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1, 2, 3, 4], dtype='i8')
+        self.assert_numpy_array_equal(result, expected)
+
+        arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1, 2, 3, np.nan, 4], dtype='f8')
+        self.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_bools(self):
+        arr = np.array([True, False, True, False], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([True, False, True, False], dtype=bool)
+        self.assert_numpy_array_equal(result, expected)
+
+        arr = np.array([True, False, None, False], dtype='O')
+        result = lib.convert_sql_column(arr)
+        expected = np.array([True, False, np.nan, False], dtype=object)
+        self.assert_numpy_array_equal(result, expected)
+
+    def test_convert_sql_column_decimals(self):
+        from decimal import Decimal
+        arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')])
+        result = lib.convert_sql_column(arr)
+        expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
+        self.assert_numpy_array_equal(result, expected)
+
+    def test_convert_downcast_int64(self):
+        from pandas.parser import na_values
+
+        arr = np.array([1, 2, 7, 8, 10], dtype=np.int64)
+        expected = np.array([1, 2, 7, 8, 10], dtype=np.int8)
+
+        # default argument
+        result = lib.downcast_int64(arr, na_values)
+        self.assert_numpy_array_equal(result, expected)
+
+        result = lib.downcast_int64(arr, na_values, use_unsigned=False)
+        self.assert_numpy_array_equal(result, expected)
+
+        expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8)
+        result = lib.downcast_int64(arr, na_values, use_unsigned=True)
+        self.assert_numpy_array_equal(result, expected)
+
+        # still cast to int8 despite use_unsigned=True
+        # because of the negative number as an element
+        arr = np.array([1, 2, -7, 8, 10], dtype=np.int64)
+        expected = np.array([1, 2, -7, 8, 10], dtype=np.int8)
+        result = lib.downcast_int64(arr, na_values, use_unsigned=True)
+        self.assert_numpy_array_equal(result, expected)
+
+        arr = np.array([1, 2, 7, 8, 300], dtype=np.int64)
+        expected = np.array([1, 2, 7, 8, 300], dtype=np.int16)
+        result = lib.downcast_int64(arr, na_values)
+        self.assert_numpy_array_equal(result, expected)
+
+        int8_na = na_values[np.int8]
+        int64_na = na_values[np.int64]
+        arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64)
+        expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8)
+        result = lib.downcast_int64(arr, na_values)
+        self.assert_numpy_array_equal(result, expected)
+
+if __name__ == '__main__':
+    import nose
+
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
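For a quick sense of the inference helpers that the new test_infer_and_convert.py module covers, here is a hedged sketch (illustrative only, using just the call forms exercised in the tests above):

    import numpy as np
    import pandas.lib as lib

    # dtype inference on object arrays, as in TestTypeInference
    print(lib.infer_dtype(np.array([1, 2, 3], dtype='O')))      # 'integer'
    print(lib.infer_dtype(np.array([True, 'foo'], dtype='O')))  # 'mixed'

    # string -> float coercion with a set of NA sentinels; the third
    # positional flag mirrors the three-argument form used in the tests
    out = lib.maybe_convert_numeric(np.array(['1.5', '-inf'], dtype=object),
                                    set(), False)
    print(out, out.dtype)  # [ 1.5 -inf] float64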
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index bf9574f48913a..6a97f195abba7 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -17,15 +17,19 @@
 import pandas.core.algorithms as algos
 import pandas.util.testing as tm
 import pandas as pd
+from pandas import lib
 from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
                                  randn, assert_series_equal)
 from pandas.compat import zip, u


 def assert_block_equal(left, right):
-    assert_almost_equal(left.values, right.values)
+    tm.assert_numpy_array_equal(left.values, right.values)
     assert (left.dtype == right.dtype)
-    assert_almost_equal(left.mgr_locs, right.mgr_locs)
+    tm.assertIsInstance(left.mgr_locs, lib.BlockPlacement)
+    tm.assertIsInstance(right.mgr_locs, lib.BlockPlacement)
+    tm.assert_numpy_array_equal(left.mgr_locs.as_array,
+                                right.mgr_locs.as_array)


 def get_numeric_mat(shape):
@@ -207,7 +211,9 @@ def _check(blk):
         _check(self.bool_block)

     def test_mgr_locs(self):
-        assert_almost_equal(self.fblock.mgr_locs, [0, 2, 4])
+        tm.assertIsInstance(self.fblock.mgr_locs, lib.BlockPlacement)
+        tm.assert_numpy_array_equal(self.fblock.mgr_locs.as_array,
+                                    np.array([0, 2, 4], dtype=np.int64))

     def test_attrs(self):
         self.assertEqual(self.fblock.shape, self.fblock.values.shape)
@@ -223,9 +229,10 @@ def test_merge(self):
         ablock = make_block(avals, ref_cols.get_indexer(['e', 'b']))
         bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd']))
         merged = ablock.merge(bblock)
-        assert_almost_equal(merged.mgr_locs, [0, 1, 2, 3])
-        assert_almost_equal(merged.values[[0, 2]], avals)
-        assert_almost_equal(merged.values[[1, 3]], bvals)
+        tm.assert_numpy_array_equal(merged.mgr_locs.as_array,
+                                    np.array([0, 1, 2, 3], dtype=np.int64))
+        tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals))
+        tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals))

         # TODO: merge with mixed type?
@@ -246,17 +253,22 @@ def test_insert(self):
     def test_delete(self):
         newb = self.fblock.copy()
         newb.delete(0)
-        assert_almost_equal(newb.mgr_locs, [2, 4])
+        tm.assertIsInstance(newb.mgr_locs, lib.BlockPlacement)
+        tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
+                                    np.array([2, 4], dtype=np.int64))
         self.assertTrue((newb.values[0] == 1).all())

         newb = self.fblock.copy()
         newb.delete(1)
-        assert_almost_equal(newb.mgr_locs, [0, 4])
+        tm.assertIsInstance(newb.mgr_locs, lib.BlockPlacement)
+        tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
+                                    np.array([0, 4], dtype=np.int64))
         self.assertTrue((newb.values[1] == 2).all())

         newb = self.fblock.copy()
         newb.delete(2)
-        assert_almost_equal(newb.mgr_locs, [0, 2])
+        tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
+                                    np.array([0, 2], dtype=np.int64))
         self.assertTrue((newb.values[1] == 1).all())

         newb = self.fblock.copy()
@@ -399,9 +411,9 @@ def test_get_scalar(self):
             for i, index in enumerate(self.mgr.axes[1]):
                 res = self.mgr.get_scalar((item, index))
                 exp = self.mgr.get(item, fastpath=False)[i]
-                assert_almost_equal(res, exp)
+                self.assertEqual(res, exp)
                 exp = self.mgr.get(item).internal_values()[i]
-                assert_almost_equal(res, exp)
+                self.assertEqual(res, exp)

     def test_get(self):
         cols = Index(list('abc'))
@@ -421,10 +433,14 @@ def test_set(self):
         mgr.set('d', np.array(['foo'] * 3))
         mgr.set('b', np.array(['bar'] * 3))
-        assert_almost_equal(mgr.get('a').internal_values(), [0] * 3)
-        assert_almost_equal(mgr.get('b').internal_values(), ['bar'] * 3)
-        assert_almost_equal(mgr.get('c').internal_values(), [2] * 3)
-        assert_almost_equal(mgr.get('d').internal_values(), ['foo'] * 3)
+        tm.assert_numpy_array_equal(mgr.get('a').internal_values(),
+                                    np.array([0] * 3))
+        tm.assert_numpy_array_equal(mgr.get('b').internal_values(),
+                                    np.array(['bar'] * 3, dtype=np.object_))
+        tm.assert_numpy_array_equal(mgr.get('c').internal_values(),
+                                    np.array([2] * 3))
+        tm.assert_numpy_array_equal(mgr.get('d').internal_values(),
+                                    np.array(['foo'] * 3, dtype=np.object_))

     def test_insert(self):
         self.mgr.insert(0, 'inserted', np.arange(N))
@@ -689,8 +705,9 @@ def test_consolidate_ordering_issues(self):
         self.assertEqual(cons.nblocks, 4)
         cons = self.mgr.consolidate().get_numeric_data()
         self.assertEqual(cons.nblocks, 1)
-        assert_almost_equal(cons.blocks[0].mgr_locs,
-                            np.arange(len(cons.items)))
+        tm.assertIsInstance(cons.blocks[0].mgr_locs, lib.BlockPlacement)
+        tm.assert_numpy_array_equal(cons.blocks[0].mgr_locs.as_array,
+                                    np.arange(len(cons.items), dtype=np.int64))

     def test_reindex_index(self):
         pass
@@ -786,18 +803,18 @@ def test_get_bool_data(self):
                             bools.get('bool').internal_values())

         bools.set('bool', np.array([True, False, True]))
-        assert_almost_equal(
-            mgr.get('bool', fastpath=False), [True, False, True])
-        assert_almost_equal(
-            mgr.get('bool').internal_values(), [True, False, True])
+        tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False),
+                                    np.array([True, False, True]))
+        tm.assert_numpy_array_equal(mgr.get('bool').internal_values(),
+                                    np.array([True, False, True]))

         # Check sharing
         bools2 = mgr.get_bool_data(copy=True)
         bools2.set('bool', np.array([False, True, False]))
-        assert_almost_equal(
-            mgr.get('bool', fastpath=False), [True, False, True])
-        assert_almost_equal(
-            mgr.get('bool').internal_values(), [True, False, True])
+        tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False),
+                                    np.array([True, False, True]))
+        tm.assert_numpy_array_equal(mgr.get('bool').internal_values(),
+                                    np.array([True, False, True]))

     def test_unicode_repr_doesnt_raise(self):
         repr(create_mgr(u('b,\u05d0: object')))
@@ -892,8 +909,7 @@ def assert_slice_ok(mgr, axis, slobj):
             mat_slobj = (slice(None), ) * axis + (slobj, )
             tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_matrix(),
                                         check_dtype=False)
-            tm.assert_numpy_array_equal(mgr.axes[axis][slobj],
-                                        sliced.axes[axis])
+            tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis])

         for mgr in self.MANAGERS:
             for ax in range(mgr.ndim):
@@ -931,8 +947,8 @@ def assert_take_ok(mgr, axis, indexer):
             taken = mgr.take(indexer, axis)
             tm.assert_numpy_array_equal(np.take(mat, indexer, axis),
                                         taken.as_matrix(), check_dtype=False)
-            tm.assert_numpy_array_equal(mgr.axes[axis].take(indexer),
-                                        taken.axes[axis])
+            tm.assert_index_equal(mgr.axes[axis].take(indexer),
+                                  taken.axes[axis])

         for mgr in self.MANAGERS:
             for ax in range(mgr.ndim):
np.array(list('abc'), dtype='S1') - self.assertEqual(pd.lib.infer_dtype(arr), compare) + gen = (key for key in keys) + expected = np.array(['p', 'a', 'n', 'd', 's']) + out = lib.fast_unique_multiple_list_gen(gen, sort=False) + tm.assert_numpy_array_equal(np.array(out), expected) - # object array of bytes - arr = arr.astype(object) - self.assertEqual(pd.lib.infer_dtype(arr), compare) + +class TestIndexing(tm.TestCase): def test_maybe_indices_to_slice_left_edge(self): target = np.arange(100) @@ -174,151 +169,58 @@ def test_maybe_indices_to_slice_middle(self): self.assert_numpy_array_equal(maybe_slice, indices) self.assert_numpy_array_equal(target[indices], target[maybe_slice]) - def test_isinf_scalar(self): - # GH 11352 - self.assertTrue(lib.isposinf_scalar(float('inf'))) - self.assertTrue(lib.isposinf_scalar(np.inf)) - self.assertFalse(lib.isposinf_scalar(-np.inf)) - self.assertFalse(lib.isposinf_scalar(1)) - self.assertFalse(lib.isposinf_scalar('a')) - - self.assertTrue(lib.isneginf_scalar(float('-inf'))) - self.assertTrue(lib.isneginf_scalar(-np.inf)) - self.assertFalse(lib.isneginf_scalar(np.inf)) - self.assertFalse(lib.isneginf_scalar(1)) - self.assertFalse(lib.isneginf_scalar('a')) - - -class Testisscalar(tm.TestCase): - - def test_isscalar_builtin_scalars(self): - self.assertTrue(lib.isscalar(None)) - self.assertTrue(lib.isscalar(True)) - self.assertTrue(lib.isscalar(False)) - self.assertTrue(lib.isscalar(0.)) - self.assertTrue(lib.isscalar(np.nan)) - self.assertTrue(lib.isscalar('foobar')) - self.assertTrue(lib.isscalar(b'foobar')) - self.assertTrue(lib.isscalar(u('efoobar'))) - self.assertTrue(lib.isscalar(datetime(2014, 1, 1))) - self.assertTrue(lib.isscalar(date(2014, 1, 1))) - self.assertTrue(lib.isscalar(time(12, 0))) - self.assertTrue(lib.isscalar(timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.NaT)) - - def test_isscalar_builtin_nonscalars(self): - self.assertFalse(lib.isscalar({})) - self.assertFalse(lib.isscalar([])) - self.assertFalse(lib.isscalar([1])) - self.assertFalse(lib.isscalar(())) - self.assertFalse(lib.isscalar((1, ))) - self.assertFalse(lib.isscalar(slice(None))) - self.assertFalse(lib.isscalar(Ellipsis)) - - def test_isscalar_numpy_array_scalars(self): - self.assertTrue(lib.isscalar(np.int64(1))) - self.assertTrue(lib.isscalar(np.float64(1.))) - self.assertTrue(lib.isscalar(np.int32(1))) - self.assertTrue(lib.isscalar(np.object_('foobar'))) - self.assertTrue(lib.isscalar(np.str_('foobar'))) - self.assertTrue(lib.isscalar(np.unicode_(u('foobar')))) - self.assertTrue(lib.isscalar(np.bytes_(b'foobar'))) - self.assertTrue(lib.isscalar(np.datetime64('2014-01-01'))) - self.assertTrue(lib.isscalar(np.timedelta64(1, 'h'))) - - def test_isscalar_numpy_zerodim_arrays(self): - for zerodim in [np.array(1), np.array('foobar'), - np.array(np.datetime64('2014-01-01')), - np.array(np.timedelta64(1, 'h')), - np.array(np.datetime64('NaT'))]: - self.assertFalse(lib.isscalar(zerodim)) - self.assertTrue(lib.isscalar(lib.item_from_zerodim(zerodim))) - - def test_isscalar_numpy_arrays(self): - self.assertFalse(lib.isscalar(np.array([]))) - self.assertFalse(lib.isscalar(np.array([[]]))) - self.assertFalse(lib.isscalar(np.matrix('1; 2'))) - - def test_isscalar_pandas_scalars(self): - self.assertTrue(lib.isscalar(pd.Timestamp('2014-01-01'))) - self.assertTrue(lib.isscalar(pd.Timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.Period('2014-01-01'))) - - def test_lisscalar_pandas_containers(self): - self.assertFalse(lib.isscalar(pd.Series())) - 
self.assertFalse(lib.isscalar(pd.Series([1]))) - self.assertFalse(lib.isscalar(pd.DataFrame())) - self.assertFalse(lib.isscalar(pd.DataFrame([[1]]))) - self.assertFalse(lib.isscalar(pd.Panel())) - self.assertFalse(lib.isscalar(pd.Panel([[[1]]]))) - self.assertFalse(lib.isscalar(pd.Index([]))) - self.assertFalse(lib.isscalar(pd.Index([1]))) - - -class TestParseSQL(tm.TestCase): - - def test_convert_sql_column_floats(self): - arr = np.array([1.5, None, 3, 4.2], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_strings(self): - arr = np.array(['1.5', None, '3', '4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_unicode(self): - arr = np.array([u('1.5'), None, u('3'), u('4.2')], - dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], - dtype=object) - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_ints(self): - arr = np.array([1, 2, 3, 4], dtype='O') - arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') - result = lib.convert_sql_column(arr) - result2 = lib.convert_sql_column(arr2) - expected = np.array([1, 2, 3, 4], dtype='i8') - _assert_same_values_and_dtype(result, expected) - _assert_same_values_and_dtype(result2, expected) - - arr = np.array([1, 2, 3, None, 4], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_longs(self): - arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, 4], dtype='i8') - _assert_same_values_and_dtype(result, expected) - - arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_bools(self): - arr = np.array([True, False, True, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, True, False], dtype=bool) - _assert_same_values_and_dtype(result, expected) - - arr = np.array([True, False, None, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, np.nan, False], dtype=object) - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_decimals(self): - from decimal import Decimal - arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - _assert_same_values_and_dtype(result, expected) + def test_maybe_booleans_to_slice(self): + arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8) + result = lib.maybe_booleans_to_slice(arr) + self.assertTrue(result.dtype == np.bool_) + + result = lib.maybe_booleans_to_slice(arr[:0]) + self.assertTrue(result == slice(0, 0)) + + def test_get_reverse_indexer(self): + indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) + result = lib.get_reverse_indexer(indexer, 5) + expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) + self.assertTrue(np.array_equal(result, expected)) + + +def test_duplicated_with_nas(): + keys = np.array([0, 
1, np.nan, 0, 2, np.nan], dtype=object) + + result = lib.duplicated(keys) + expected = [False, False, False, True, False, True] + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep='first') + expected = [False, False, False, True, False, True] + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep='last') + expected = [True, False, True, False, False, False] + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep=False) + expected = [True, False, True, True, False, True] + assert (np.array_equal(result, expected)) + + keys = np.empty(8, dtype=object) + for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, + [0, np.nan, 0, np.nan] * 2)): + keys[i] = t + + result = lib.duplicated(keys) + falses = [False] * 4 + trues = [True] * 4 + expected = falses + trues + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep='last') + expected = trues + falses + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep=False) + expected = trues + trues + assert (np.array_equal(result, expected)) if __name__ == '__main__': import nose diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 63a8b49ab4b00..c4ccef13f2844 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -87,19 +87,19 @@ def test_append_index(self): (1.2, datetime.datetime(2011, 1, 2, tzinfo=tz)), (1.3, datetime.datetime(2011, 1, 3, tzinfo=tz))] expected = Index([1.1, 1.2, 1.3] + expected_tuples) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = midx_lv2.append(idx1) expected = Index(expected_tuples + [1.1, 1.2, 1.3]) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv2) - expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2) - ]) - self.assertTrue(result.equals(expected)) + expected = MultiIndex.from_arrays([idx1.append(idx1), + idx2.append(idx2)]) + self.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv3) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = midx_lv3.append(midx_lv2) expected = Index._simple_new( @@ -107,7 +107,7 @@ def test_append_index(self): (1.2, datetime.datetime(2011, 1, 2, tzinfo=tz), 'B'), (1.3, datetime.datetime(2011, 1, 3, tzinfo=tz), 'C')] + expected_tuples), None) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) def test_dataframe_constructor(self): multi = DataFrame(np.random.randn(4, 4), @@ -966,9 +966,7 @@ def check(left, right): assert_series_equal(left, right) self.assertFalse(left.index.is_unique) li, ri = left.index, right.index - for i in range(ri.nlevels): - tm.assert_numpy_array_equal(li.levels[i], ri.levels[i]) - tm.assert_numpy_array_equal(li.labels[i], ri.labels[i]) + tm.assert_index_equal(li, ri) df = DataFrame(np.arange(12).reshape(4, 3), index=list('abab'), @@ -1542,8 +1540,8 @@ def aggf(x): # for good measure, groupby detail level_index = frame._get_axis(axis).levels[level] - self.assertTrue(leftside._get_axis(axis).equals(level_index)) - self.assertTrue(rightside._get_axis(axis).equals(level_index)) + self.assert_index_equal(leftside._get_axis(axis), level_index) + self.assert_index_equal(rightside._get_axis(axis), level_index) assert_frame_equal(leftside, rightside) @@ -2211,12 +2209,11 @@ def test_datetimeindex(self): tz='US/Eastern') idx = MultiIndex.from_arrays([idx1, 
idx2]) - expected1 = pd.DatetimeIndex( - ['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00' - ], tz='Asia/Tokyo') + expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', + '2013-04-03 9:00'], tz='Asia/Tokyo') - self.assertTrue(idx.levels[0].equals(expected1)) - self.assertTrue(idx.levels[1].equals(idx2)) + self.assert_index_equal(idx.levels[0], expected1) + self.assert_index_equal(idx.levels[1], idx2) # from datetime combos # GH 7888 @@ -2256,18 +2253,20 @@ def test_set_index_datetime(self): df.index = pd.to_datetime(df.pop('datetime'), utc=True) df.index = df.index.tz_localize('UTC').tz_convert('US/Pacific') - expected = pd.DatetimeIndex( - ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00']) + expected = pd.DatetimeIndex(['2011-07-19 07:00:00', + '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], name='datetime') expected = expected.tz_localize('UTC').tz_convert('US/Pacific') df = df.set_index('label', append=True) - self.assertTrue(df.index.levels[0].equals(expected)) - self.assertTrue(df.index.levels[1].equals(pd.Index(['a', 'b']))) + self.assert_index_equal(df.index.levels[0], expected) + self.assert_index_equal(df.index.levels[1], + pd.Index(['a', 'b'], name='label')) df = df.swaplevel(0, 1) - self.assertTrue(df.index.levels[0].equals(pd.Index(['a', 'b']))) - self.assertTrue(df.index.levels[1].equals(expected)) + self.assert_index_equal(df.index.levels[0], + pd.Index(['a', 'b'], name='label')) + self.assert_index_equal(df.index.levels[1], expected) df = DataFrame(np.random.random(6)) idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', @@ -2287,17 +2286,17 @@ def test_set_index_datetime(self): expected1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'], tz='US/Eastern') - expected2 = pd.DatetimeIndex( - ['2012-04-01 09:00', '2012-04-02 09:00'], tz='US/Eastern') + expected2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-02 09:00'], + tz='US/Eastern') - self.assertTrue(df.index.levels[0].equals(expected1)) - self.assertTrue(df.index.levels[1].equals(expected2)) - self.assertTrue(df.index.levels[2].equals(idx3)) + self.assert_index_equal(df.index.levels[0], expected1) + self.assert_index_equal(df.index.levels[1], expected2) + self.assert_index_equal(df.index.levels[2], idx3) # GH 7092 - self.assertTrue(df.index.get_level_values(0).equals(idx1)) - self.assertTrue(df.index.get_level_values(1).equals(idx2)) - self.assertTrue(df.index.get_level_values(2).equals(idx3)) + self.assert_index_equal(df.index.get_level_values(0), idx1) + self.assert_index_equal(df.index.get_level_values(1), idx2) + self.assert_index_equal(df.index.get_level_values(2), idx3) def test_reset_index_datetime(self): # GH 3950 @@ -2404,13 +2403,13 @@ def test_set_index_period(self): expected1 = pd.period_range('2011-01-01', periods=3, freq='M') expected2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') - self.assertTrue(df.index.levels[0].equals(expected1)) - self.assertTrue(df.index.levels[1].equals(expected2)) - self.assertTrue(df.index.levels[2].equals(idx3)) + self.assert_index_equal(df.index.levels[0], expected1) + self.assert_index_equal(df.index.levels[1], expected2) + self.assert_index_equal(df.index.levels[2], idx3) - self.assertTrue(df.index.get_level_values(0).equals(idx1)) - self.assertTrue(df.index.get_level_values(1).equals(idx2)) - self.assertTrue(df.index.get_level_values(2).equals(idx3)) + self.assert_index_equal(df.index.get_level_values(0), idx1) + 
self.assert_index_equal(df.index.get_level_values(1), idx2)
+        self.assert_index_equal(df.index.get_level_values(2), idx3)

     def test_repeat(self):
         # GH 9361
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index d33a64002c3b1..904bedde03312 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -799,30 +799,31 @@ def setUp(self):
     def test_nanvar_all_finite(self):
         samples = self.samples
         actual_variance = nanops.nanvar(samples)
-        np.testing.assert_almost_equal(actual_variance, self.variance,
-                                       decimal=2)
+        tm.assert_almost_equal(actual_variance, self.variance,
+                               check_less_precise=2)

     def test_nanvar_nans(self):
         samples = np.nan * np.ones(2 * self.samples.shape[0])
         samples[::2] = self.samples

         actual_variance = nanops.nanvar(samples, skipna=True)
-        np.testing.assert_almost_equal(actual_variance, self.variance,
-                                       decimal=2)
+        tm.assert_almost_equal(actual_variance, self.variance,
+                               check_less_precise=2)

         actual_variance = nanops.nanvar(samples, skipna=False)
-        np.testing.assert_almost_equal(actual_variance, np.nan, decimal=2)
+        tm.assert_almost_equal(actual_variance, np.nan, check_less_precise=2)

     def test_nanstd_nans(self):
         samples = np.nan * np.ones(2 * self.samples.shape[0])
         samples[::2] = self.samples

         actual_std = nanops.nanstd(samples, skipna=True)
-        np.testing.assert_almost_equal(actual_std, self.variance ** 0.5,
-                                       decimal=2)
+        tm.assert_almost_equal(actual_std, self.variance ** 0.5,
+                               check_less_precise=2)

         actual_std = nanops.nanstd(samples, skipna=False)
-        np.testing.assert_almost_equal(actual_std, np.nan, decimal=2)
+        tm.assert_almost_equal(actual_std, np.nan,
+                               check_less_precise=2)

     def test_nanvar_axis(self):
         # Generate some sample data.
@@ -831,8 +832,8 @@ def test_nanvar_axis(self):
         samples = np.vstack([samples_norm, samples_unif])

         actual_variance = nanops.nanvar(samples, axis=1)
-        np.testing.assert_array_almost_equal(actual_variance, np.array(
-            [self.variance, 1.0 / 12]), decimal=2)
+        tm.assert_almost_equal(actual_variance, np.array(
+            [self.variance, 1.0 / 12]), check_less_precise=2)

     def test_nanvar_ddof(self):
         n = 5
@@ -845,13 +846,16 @@ def test_nanvar_ddof(self):
         # The unbiased estimate.
         var = 1.0 / 12
-        np.testing.assert_almost_equal(variance_1, var, decimal=2)
+        tm.assert_almost_equal(variance_1, var,
+                               check_less_precise=2)
+
         # The underestimated variance.
-        np.testing.assert_almost_equal(variance_0, (n - 1.0) / n * var,
-                                       decimal=2)
+        tm.assert_almost_equal(variance_0, (n - 1.0) / n * var,
+                               check_less_precise=2)
+
         # The overestimated variance.
-        np.testing.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var,
-                                       decimal=2)
+        tm.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var,
+                               check_less_precise=2)

     def test_ground_truth(self):
         # Test against values that were precomputed with Numpy.
@@ -873,17 +877,15 @@ def test_ground_truth(self):
         for axis in range(2):
             for ddof in range(3):
                 var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
-                np.testing.assert_array_almost_equal(var[:3],
-                                                     variance[axis, ddof])
-                np.testing.assert_equal(var[3], np.nan)
+                tm.assert_almost_equal(var[:3], variance[axis, ddof])
+                self.assertTrue(np.isnan(var[3]))

         # Test nanstd.
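        # For reference, a minimal sketch of the two precision idioms being
        # swapped in this file (an aside, not part of the patch; it assumes
        # only that numpy and pandas.util.testing are importable). numpy's
        # ``decimal=n`` asserts ``abs(desired - actual) < 1.5 * 10**-n``,
        # while pandas' ``check_less_precise`` accepts True (compare 3 digits
        # after the decimal point) or, as of this change, an int giving the
        # number of digits to compare:
        #
        #     import numpy as np
        #     import pandas.util.testing as tm
        #
        #     np.testing.assert_almost_equal(0.123456, 0.123457, decimal=5)
        #     tm.assert_almost_equal(0.123456, 0.123457, check_less_precise=5)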
for axis in range(2): for ddof in range(3): std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof) - np.testing.assert_array_almost_equal( - std[:3], variance[axis, ddof] ** 0.5) - np.testing.assert_equal(std[3], np.nan) + tm.assert_almost_equal(std[:3], variance[axis, ddof] ** 0.5) + self.assertTrue(np.isnan(std[3])) def test_nanstd_roundoff(self): # Regression test for GH 10242 (test data taken from GH 10489). Ensure @@ -931,7 +933,7 @@ def test_axis(self): samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) skew = nanops.nanskew(samples, axis=1) - tm.assert_almost_equal(skew, [self.actual_skew, np.nan]) + tm.assert_almost_equal(skew, np.array([self.actual_skew, np.nan])) def test_nans(self): samples = np.hstack([self.samples, np.nan]) @@ -981,7 +983,7 @@ def test_axis(self): samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) kurt = nanops.nankurt(samples, axis=1) - tm.assert_almost_equal(kurt, [self.actual_kurt, np.nan]) + tm.assert_almost_equal(kurt, np.array([self.actual_kurt, np.nan])) def test_nans(self): samples = np.hstack([self.samples, np.nan]) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 87401f272adbd..b1f09ad2685e3 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1086,12 +1086,12 @@ def test_ctor_dict(self): # TODO: unused? wp3 = Panel.from_dict(d3) # noqa - self.assertTrue(wp.major_axis.equals(self.panel.major_axis)) + self.assert_index_equal(wp.major_axis, self.panel.major_axis) assert_panel_equal(wp, wp2) # intersect wp = Panel.from_dict(d, intersect=True) - self.assertTrue(wp.major_axis.equals(itemb.index[5:])) + self.assert_index_equal(wp.major_axis, itemb.index[5:]) # use constructor assert_panel_equal(Panel(d), Panel.from_dict(d)) @@ -1123,7 +1123,7 @@ def test_constructor_dict_mixed(self): data = dict((k, v.values) for k, v in self.panel.iteritems()) result = Panel(data) exp_major = Index(np.arange(len(self.panel.major_axis))) - self.assertTrue(result.major_axis.equals(exp_major)) + self.assert_index_equal(result.major_axis, exp_major) result = Panel(data, items=self.panel.items, major_axis=self.panel.major_axis, @@ -1213,8 +1213,8 @@ def test_conform(self): df = self.panel['ItemA'][:-5].filter(items=['A', 'B']) conformed = self.panel.conform(df) - assert (conformed.index.equals(self.panel.major_axis)) - assert (conformed.columns.equals(self.panel.minor_axis)) + tm.assert_index_equal(conformed.index, self.panel.major_axis) + tm.assert_index_equal(conformed.columns, self.panel.minor_axis) def test_convert_objects(self): @@ -2078,11 +2078,11 @@ def test_rename(self): renamed = self.panel.rename_axis(mapper, axis=0) exp = Index(['foo', 'bar', 'baz']) - self.assertTrue(renamed.items.equals(exp)) + self.assert_index_equal(renamed.items, exp) renamed = self.panel.rename_axis(str.lower, axis=2) exp = Index(['a', 'b', 'c', 'd']) - self.assertTrue(renamed.minor_axis.equals(exp)) + self.assert_index_equal(renamed.minor_axis, exp) # don't copy renamed_nocopy = self.panel.rename_axis(mapper, axis=0, copy=False) @@ -2301,8 +2301,8 @@ def test_update_raise(self): [[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.]]]) - np.testing.assert_raises(Exception, pan.update, *(pan, ), - **{'raise_conflict': True}) + self.assertRaises(Exception, pan.update, *(pan, ), + **{'raise_conflict': True}) def test_all_any(self): self.assertTrue((self.panel.all(axis=0).values == nanall( @@ -2485,7 +2485,7 @@ def test_axis_dummies(self): transformed = 
make_axis_dummies(self.panel, 'minor', transform=mapping.get) self.assertEqual(len(transformed.columns), 2) - self.assert_numpy_array_equal(transformed.columns, ['one', 'two']) + self.assert_index_equal(transformed.columns, Index(['one', 'two'])) # TODO: test correctness @@ -2578,10 +2578,10 @@ def _monotonic(arr): def test_panel_index(): index = panelm.panel_index([1, 2, 3, 4], [1, 2, 3]) - expected = MultiIndex.from_arrays([np.tile( - [1, 2, 3, 4], 3), np.repeat( - [1, 2, 3], 4)]) - assert (index.equals(expected)) + expected = MultiIndex.from_arrays([np.tile([1, 2, 3, 4], 3), + np.repeat([1, 2, 3], 4)], + names=['time', 'panel']) + tm.assert_index_equal(index, expected) def test_import_warnings(): diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index e3e906d48ae98..607048df29faa 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -733,7 +733,7 @@ def test_constructor_dict_mixed(self): data = dict((k, v.values) for k, v in self.panel4d.iteritems()) result = Panel4D(data) exp_major = Index(np.arange(len(self.panel4d.major_axis))) - self.assertTrue(result.major_axis.equals(exp_major)) + self.assert_index_equal(result.major_axis, exp_major) result = Panel4D(data, labels=self.panel4d.labels, @@ -799,9 +799,9 @@ def test_conform(self): p = self.panel4d['l1'].filter(items=['ItemA', 'ItemB']) conformed = self.panel4d.conform(p) - assert(conformed.items.equals(self.panel4d.labels)) - assert(conformed.major_axis.equals(self.panel4d.major_axis)) - assert(conformed.minor_axis.equals(self.panel4d.minor_axis)) + tm.assert_index_equal(conformed.items, self.panel4d.labels) + tm.assert_index_equal(conformed.major_axis, self.panel4d.major_axis) + tm.assert_index_equal(conformed.minor_axis, self.panel4d.minor_axis) def test_reindex(self): ref = self.panel4d['l2'] @@ -1085,11 +1085,11 @@ def test_rename(self): renamed = self.panel4d.rename_axis(mapper, axis=0) exp = Index(['foo', 'bar', 'baz']) - self.assertTrue(renamed.labels.equals(exp)) + self.assert_index_equal(renamed.labels, exp) renamed = self.panel4d.rename_axis(str.lower, axis=3) exp = Index(['a', 'b', 'c', 'd']) - self.assertTrue(renamed.minor_axis.equals(exp)) + self.assert_index_equal(renamed.minor_axis, exp) # don't copy renamed_nocopy = self.panel4d.rename_axis(mapper, axis=0, copy=False) diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 862e2282bae2f..7136d7effc1fc 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -239,26 +239,16 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) - exp = DataFrame({'a': {0: 1.0, - 1: 0.0, - 2: 0.0}, - 'b': {0: 0.0, - 1: 1.0, - 2: 0.0}}) + exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) - exp_na = DataFrame({nan: {0: 0.0, - 1: 0.0, - 2: 1.0}, - 'a': {0: 1.0, - 1: 0.0, - 2: 0.0}, - 'b': {0: 0.0, - 1: 1.0, - 2: 0.0}}).reindex_axis( - ['a', 'b', nan], 1) + exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, + 'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 4179949bc49a6..3d1851966afd0 100644 --- 
a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -48,12 +48,12 @@ def test_iter(self): # indices of each yielded Series should be equal to the index of # the original Series - tm.assert_numpy_array_equal(s.index, ds.index) + tm.assert_index_equal(s.index, ds.index) for el in s: # each element of the series is either a basestring/str or nan - self.assertTrue(isinstance(el, compat.string_types) or isnull( - el)) + self.assertTrue(isinstance(el, compat.string_types) or + isnull(el)) # desired behavior is to iterate until everything would be nan on the # next iter so make sure the last element of the iterator was 'l' in @@ -95,8 +95,8 @@ def test_iter_object_try_string(self): self.assertEqual(s, 'h') def test_cat(self): - one = ['a', 'a', 'b', 'b', 'c', NA] - two = ['a', NA, 'b', 'd', 'foo', NA] + one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_) + two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_) # single array result = strings.str_cat(one) @@ -121,21 +121,24 @@ def test_cat(self): # Multiple arrays result = strings.str_cat(one, [two], na_rep='NA') - exp = ['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'] + exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'], + dtype=np.object_) self.assert_numpy_array_equal(result, exp) result = strings.str_cat(one, two) - exp = ['aa', NA, 'bb', 'bd', 'cfoo', NA] + exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) tm.assert_almost_equal(result, exp) def test_count(self): - values = ['foo', 'foofoo', NA, 'foooofooofommmfoo'] + values = np.array(['foo', 'foofoo', NA, 'foooofooofommmfoo'], + dtype=np.object_) result = strings.str_count(values, 'f[o]+') - exp = Series([1, 2, NA, 4]) - tm.assert_almost_equal(result, exp) + exp = np.array([1, 2, NA, 4]) + tm.assert_numpy_array_equal(result, exp) result = Series(values).str.count('f[o]+') + exp = Series([1, 2, NA, 4]) tm.assertIsInstance(result, Series) tm.assert_series_equal(result, exp) @@ -163,61 +166,66 @@ def test_count(self): tm.assert_series_equal(result, exp) def test_contains(self): - values = ['foo', NA, 'fooommm__foo', 'mmm_', 'foommm[_]+bar'] + values = np.array(['foo', NA, 'fooommm__foo', + 'mmm_', 'foommm[_]+bar'], dtype=np.object_) pat = 'mmm[_]+' result = strings.str_contains(values, pat) - expected = [False, NA, True, True, False] - tm.assert_almost_equal(result, expected) + expected = np.array([False, NA, True, True, False], dtype=np.object_) + tm.assert_numpy_array_equal(result, expected) result = strings.str_contains(values, pat, regex=False) - expected = [False, NA, False, False, True] - tm.assert_almost_equal(result, expected) + expected = np.array([False, NA, False, False, True], dtype=np.object_) + tm.assert_numpy_array_equal(result, expected) values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] result = strings.str_contains(values, pat) - expected = [False, False, True, True] + expected = np.array([False, False, True, True]) self.assertEqual(result.dtype, np.bool_) - tm.assert_almost_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # case insensitive using regex values = ['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_'] result = strings.str_contains(values, 'FOO|mmm', case=False) - expected = [True, False, True, True] - tm.assert_almost_equal(result, expected) + expected = np.array([True, False, True, True]) + tm.assert_numpy_array_equal(result, expected) # case insensitive without regex result = strings.str_contains(values, 'foo', regex=False, case=False) - expected = [True, False, True, False] - 
tm.assert_almost_equal(result, expected) + expected = np.array([True, False, True, False]) + tm.assert_numpy_array_equal(result, expected) # mixed mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] rs = strings.str_contains(mixed, 'o') - xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) - tm.assert_almost_equal(rs, xp) + xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], + dtype=np.object_) + tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.contains('o') + xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) tm.assertIsInstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode - values = [u('foo'), NA, u('fooommm__foo'), u('mmm_')] + values = np.array([u'foo', NA, u'fooommm__foo', u'mmm_'], + dtype=np.object_) pat = 'mmm[_]+' result = strings.str_contains(values, pat) - expected = [False, np.nan, True, True] - tm.assert_almost_equal(result, expected) + expected = np.array([False, np.nan, True, True], dtype=np.object_) + tm.assert_numpy_array_equal(result, expected) result = strings.str_contains(values, pat, na=False) - expected = [False, False, True, True] - tm.assert_almost_equal(result, expected) + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) - values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] + values = np.array(['foo', 'xyz', 'fooommm__foo', 'mmm_'], + dtype=np.object_) result = strings.str_contains(values, pat) - expected = [False, False, True, True] + expected = np.array([False, False, True, True]) self.assertEqual(result.dtype, np.bool_) - tm.assert_almost_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # na values = Series(['om', 'foo', np.nan]) @@ -232,13 +240,16 @@ def test_startswith(self): tm.assert_series_equal(result, exp) # mixed - mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] + mixed = np.array(['a', NA, 'b', True, datetime.today(), + 'foo', None, 1, 2.], dtype=np.object_) rs = strings.str_startswith(mixed, 'f') - xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) - tm.assert_almost_equal(rs, xp) + xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], + dtype=np.object_) + tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.startswith('f') tm.assertIsInstance(rs, Series) + xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) tm.assert_series_equal(rs, xp) # unicode @@ -262,10 +273,12 @@ def test_endswith(self): # mixed mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
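        # A side note on the ``dtype=np.object_`` expectations in these
        # hunks (plain numpy behaviour, nothing pandas-specific assumed):
        # numpy silently coerces booleans mixed with NA to float, so an
        # object array is the only way to keep bools and NA side by side --
        #
        #     import numpy as np
        #
        #     np.array([False, np.nan, True]).dtype                    # float64
        #     np.array([False, np.nan, True], dtype=np.object_).dtype  # object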
rs = strings.str_endswith(mixed, 'f')
-        xp = Series([False, NA, False, NA, NA, False, NA, NA, NA])
-        tm.assert_almost_equal(rs, xp)
+        xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA],
+                      dtype=np.object_)
+        tm.assert_numpy_array_equal(rs, xp)

         rs = Series(mixed).str.endswith('f')
+        xp = Series([False, NA, False, NA, NA, False, NA, NA, NA])
         tm.assertIsInstance(rs, Series)
         tm.assert_series_equal(rs, xp)
@@ -573,8 +586,13 @@ def test_extract_expand_False(self):
         # single group renames series/index properly
         s_or_idx = klass(['A1', 'A2'])
         result = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=False)
-        tm.assert_equal(result.name, 'uno')
-        tm.assert_numpy_array_equal(result, klass(['A', 'A']))
+        self.assertEqual(result.name, 'uno')
+
+        exp = klass(['A', 'A'], name='uno')
+        if klass == Series:
+            tm.assert_series_equal(result, exp)
+        else:
+            tm.assert_index_equal(result, exp)

         s = Series(['A1', 'B2', 'C3'])
         # one group, no matches
@@ -713,8 +731,9 @@ def test_extract_expand_True(self):
         # single group renames series/index properly
         s_or_idx = klass(['A1', 'A2'])
         result_df = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=True)
+        tm.assertIsInstance(result_df, DataFrame)
         result_series = result_df['uno']
-        tm.assert_numpy_array_equal(result_series, klass(['A', 'A']))
+        assert_series_equal(result_series, Series(['A', 'A'], name='uno'))

     def test_extract_series(self):
         # extract should give the same result whether or not the
@@ -982,6 +1001,30 @@ def test_extractall_no_matches(self):
                                          "second"])
         tm.assert_frame_equal(r, e)

+    def test_extractall_stringindex(self):
+        s = Series(["a1a2", "b1", "c1"], name='xxx')
+        res = s.str.extractall("[ab](?P<digit>\d)")
+        exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
+                                         names=[None, 'match'])
+        exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
+        tm.assert_frame_equal(res, exp)
+
+        # index should return the same result as the default index without
+        # name, thus index.name doesn't affect the result
+        for idx in [Index(["a1a2", "b1", "c1"]),
+                    Index(["a1a2", "b1", "c1"], name='xxx')]:
+
+            res = idx.str.extractall("[ab](?P<digit>\d)")
+            tm.assert_frame_equal(res, exp)
+
+        s = Series(["a1a2", "b1", "c1"], name='s_name',
+                   index=Index(["XX", "yy", "zz"], name='idx_name'))
+        res = s.str.extractall("[ab](?P<digit>\d)")
+        exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)],
+                                         names=["idx_name", 'match'])
+        exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
+        tm.assert_frame_equal(res, exp)
+
     def test_extractall_errors(self):
         # Does not make sense to use extractall with a regex that has
         # no capture groups.
(it returns DataFrame with one column for @@ -991,8 +1034,8 @@ def test_extractall_errors(self): s.str.extractall(r'[a-z]') def test_extract_index_one_two_groups(self): - s = Series( - ['a3', 'b3', 'd4c2'], ["A3", "B3", "D4"], name='series_name') + s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"], + name='series_name') r = s.index.str.extract(r'([A-Z])', expand=True) e = DataFrame(['A', "B", "D"]) tm.assert_frame_equal(r, e) @@ -1081,7 +1124,7 @@ def test_empty_str_methods(self): # (extract) on empty series tm.assert_series_equal(empty_str, empty.str.cat(empty)) - tm.assert_equal('', empty.str.cat()) + self.assertEqual('', empty.str.cat()) tm.assert_series_equal(empty_str, empty.str.title()) tm.assert_series_equal(empty_int, empty.str.count('a')) tm.assert_series_equal(empty_bool, empty.str.contains('a')) @@ -1398,41 +1441,48 @@ def test_find_nan(self): tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) def test_index(self): + + def _check(result, expected): + if isinstance(result, Series): + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result, expected) + for klass in [Series, Index]: s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF']) result = s.str.index('EF') - tm.assert_numpy_array_equal(result, klass([4, 3, 1, 0])) + _check(result, klass([4, 3, 1, 0])) expected = np.array([v.index('EF') for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.rindex('EF') - tm.assert_numpy_array_equal(result, klass([4, 5, 7, 4])) + _check(result, klass([4, 5, 7, 4])) expected = np.array([v.rindex('EF') for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.index('EF', 3) - tm.assert_numpy_array_equal(result, klass([4, 3, 7, 4])) + _check(result, klass([4, 3, 7, 4])) expected = np.array([v.index('EF', 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.rindex('EF', 3) - tm.assert_numpy_array_equal(result, klass([4, 5, 7, 4])) + _check(result, klass([4, 5, 7, 4])) expected = np.array([v.rindex('EF', 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.index('E', 4, 8) - tm.assert_numpy_array_equal(result, klass([4, 5, 7, 4])) + _check(result, klass([4, 5, 7, 4])) expected = np.array([v.index('E', 4, 8) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.rindex('E', 0, 5) - tm.assert_numpy_array_equal(result, klass([4, 3, 1, 4])) + _check(result, klass([4, 3, 1, 4])) expected = np.array([v.rindex('E', 0, 5) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) @@ -1447,9 +1497,9 @@ def test_index(self): # test with nan s = Series(['abcb', 'ab', 'bcbe', np.nan]) result = s.str.index('b') - tm.assert_numpy_array_equal(result, Series([1, 1, 0, np.nan])) + tm.assert_series_equal(result, Series([1, 1, 0, np.nan])) result = s.str.rindex('b') - tm.assert_numpy_array_equal(result, Series([3, 1, 2, np.nan])) + tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) def test_pad(self): values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) @@ -1534,6 +1584,13 @@ def test_pad_fillchar(self): result = values.str.pad(5, fillchar=5) def test_translate(self): + + def _check(result, expected): + if isinstance(result, Series): + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result, expected) + for klass in [Series, Index]: s = klass(['abcdefg', 
'abcc', 'cdddfg', 'cdefggg']) if not compat.PY3: @@ -1543,17 +1600,17 @@ def test_translate(self): table = str.maketrans('abc', 'cde') result = s.str.translate(table) expected = klass(['cdedefg', 'cdee', 'edddfg', 'edefggg']) - tm.assert_numpy_array_equal(result, expected) + _check(result, expected) # use of deletechars is python 2 only if not compat.PY3: result = s.str.translate(table, deletechars='fg') expected = klass(['cdede', 'cdee', 'eddd', 'ede']) - tm.assert_numpy_array_equal(result, expected) + _check(result, expected) result = s.str.translate(None, deletechars='fg') expected = klass(['abcde', 'abcc', 'cddd', 'cde']) - tm.assert_numpy_array_equal(result, expected) + _check(result, expected) else: with tm.assertRaisesRegexp( ValueError, "deletechars is not a valid argument"): @@ -1563,7 +1620,7 @@ def test_translate(self): s = Series(['a', 'b', 'c', 1.2]) expected = Series(['c', 'd', 'e', np.nan]) result = s.str.translate(table) - tm.assert_numpy_array_equal(result, expected) + tm.assert_series_equal(result, expected) def test_center_ljust_rjust(self): values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) @@ -1961,8 +2018,8 @@ def test_rsplit_to_multiindex_expand(self): idx = Index(['some_equal_splits', 'with_no_nans']) result = idx.str.rsplit('_', expand=True, n=1) - exp = MultiIndex.from_tuples([('some_equal', 'splits'), ('with_no', - 'nans')]) + exp = MultiIndex.from_tuples([('some_equal', 'splits'), + ('with_no', 'nans')]) tm.assert_index_equal(result, exp) self.assertEqual(result.nlevels, 2) @@ -1972,7 +2029,7 @@ def test_split_with_name(self): # should preserve name s = Series(['a,b', 'c,d'], name='xxx') res = s.str.split(',') - exp = Series([('a', 'b'), ('c', 'd')], name='xxx') + exp = Series([['a', 'b'], ['c', 'd']], name='xxx') tm.assert_series_equal(res, exp) res = s.str.split(',', expand=True) @@ -1994,60 +2051,60 @@ def test_partition_series(self): values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) result = values.str.partition('_', expand=False) - exp = Series([['a', '_', 'b_c'], ['c', '_', 'd_e'], NA, ['f', '_', - 'g_h']]) + exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA, + ('f', '_', 'g_h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) - exp = Series([['a_b', '_', 'c'], ['c_d', '_', 'e'], NA, ['f_g', '_', - 'h']]) + exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA, + ('f_g', '_', 'h')]) tm.assert_series_equal(result, exp) # more than one char values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) result = values.str.partition('__', expand=False) - exp = Series([['a', '__', 'b__c'], ['c', '__', 'd__e'], NA, ['f', '__', - 'g__h']]) + exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA, + ('f', '__', 'g__h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('__', expand=False) - exp = Series([['a__b', '__', 'c'], ['c__d', '__', 'e'], NA, - ['f__g', '__', 'h']]) + exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA, + ('f__g', '__', 'h')]) tm.assert_series_equal(result, exp) # None values = Series(['a b c', 'c d e', NA, 'f g h']) result = values.str.partition(expand=False) - exp = Series([['a', ' ', 'b c'], ['c', ' ', 'd e'], NA, ['f', ' ', - 'g h']]) + exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA, + ('f', ' ', 'g h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition(expand=False) - exp = Series([['a b', ' ', 'c'], ['c d', ' ', 'e'], NA, ['f g', ' ', - 'h']]) + exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA, + ('f g', ' ', 'h')]) 
tm.assert_series_equal(result, exp)

         # Not split
         values = Series(['abc', 'cde', NA, 'fgh'])
         result = values.str.partition('_', expand=False)
-        exp = Series([['abc', '', ''], ['cde', '', ''], NA, ['fgh', '', '']])
+        exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')])
         tm.assert_series_equal(result, exp)

         result = values.str.rpartition('_', expand=False)
-        exp = Series([['', '', 'abc'], ['', '', 'cde'], NA, ['', '', 'fgh']])
+        exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')])
         tm.assert_series_equal(result, exp)

         # unicode
-        values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
+        values = Series([u'a_b_c', u'c_d_e', NA, u'f_g_h'])
         result = values.str.partition('_', expand=False)
-        exp = Series([[u('a'), u('_'), u('b_c')], [u('c'), u('_'), u('d_e')],
-                      NA, [u('f'), u('_'), u('g_h')]])
+        exp = Series([(u'a', u'_', u'b_c'), (u'c', u'_', u'd_e'),
+                      NA, (u'f', u'_', u'g_h')])
         tm.assert_series_equal(result, exp)

         result = values.str.rpartition('_', expand=False)
-        exp = Series([[u('a_b'), u('_'), u('c')], [u('c_d'), u('_'), u('e')],
-                      NA, [u('f_g'), u('_'), u('h')]])
+        exp = Series([(u'a_b', u'_', u'c'), (u'c_d', u'_', u'e'),
+                      NA, (u'f_g', u'_', u'h')])
         tm.assert_series_equal(result, exp)

         # compare to standard lib
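(The partition hunks above change the expected elements from lists to
tuples: Python's str.partition returns a 3-tuple, and under the stricter
Series comparison a list-valued expectation no longer matches. A minimal
sketch of the distinction, using only stdlib semantics:

    'a_b_c'.partition('_')                  # ('a', '_', 'b_c') -- a tuple
    ['a', '_', 'b_c'] == ('a', '_', 'b_c')  # False: a list never equals
                                            # a tuple, even element-wise
)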
diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py
index 9294bccce013f..c4e864a909c03 100644
--- a/pandas/tests/test_testing.py
+++ b/pandas/tests/test_testing.py
@@ -43,6 +43,8 @@ def test_assert_almost_equal_numbers(self):
     def test_assert_almost_equal_numbers_with_zeros(self):
         self._assert_almost_equal_both(0, 0)
+        self._assert_almost_equal_both(0, 0.0)
+        self._assert_almost_equal_both(0, np.float64(0))
         self._assert_almost_equal_both(0.000001, 0)

         self._assert_not_almost_equal_both(0.001, 0)
@@ -65,9 +67,8 @@ def test_assert_almost_equal_dicts(self):
         self._assert_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 2})
         self._assert_not_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 3})
-        self._assert_not_almost_equal_both(
-            {'a': 1, 'b': 2}, {'a': 1, 'b': 2, 'c': 3}
-        )
+        self._assert_not_almost_equal_both({'a': 1, 'b': 2},
+                                           {'a': 1, 'b': 2, 'c': 3})
         self._assert_not_almost_equal_both({'a': 1}, 1)
         self._assert_not_almost_equal_both({'a': 1}, 'abc')
         self._assert_not_almost_equal_both({'a': 1}, [1, ])
@@ -82,9 +83,11 @@ def __getitem__(self, item):
                 if item == 'a':
                     return 1

-        self._assert_almost_equal_both({'a': 1}, DictLikeObj())
+        self._assert_almost_equal_both({'a': 1}, DictLikeObj(),
+                                       check_dtype=False)

-        self._assert_not_almost_equal_both({'a': 2}, DictLikeObj())
+        self._assert_not_almost_equal_both({'a': 2}, DictLikeObj(),
+                                           check_dtype=False)

     def test_assert_almost_equal_strings(self):
         self._assert_almost_equal_both('abc', 'abc')
@@ -96,7 +99,13 @@ def test_assert_almost_equal_strings(self):
     def test_assert_almost_equal_iterables(self):
         self._assert_almost_equal_both([1, 2, 3], [1, 2, 3])
-        self._assert_almost_equal_both(np.array([1, 2, 3]), [1, 2, 3])
+        self._assert_almost_equal_both(np.array([1, 2, 3]),
+                                       np.array([1, 2, 3]))
+
+        # class / dtype are different
+        self._assert_not_almost_equal_both(np.array([1, 2, 3]), [1, 2, 3])
+        self._assert_not_almost_equal_both(np.array([1, 2, 3]),
+                                           np.array([1., 2., 3.]))

         # Can't compare generators
         self._assert_not_almost_equal_both(iter([1, 2, 3]), [1, 2, 3])
@@ -107,8 +116,8 @@
     def test_assert_almost_equal_null(self):
         self._assert_almost_equal_both(None, None)
-        self._assert_almost_equal_both(None, np.NaN)
+        self._assert_not_almost_equal_both(None, np.NaN)

         self._assert_not_almost_equal_both(None, 0)
         self._assert_not_almost_equal_both(np.NaN, 0)
@@ -177,7 +186,7 @@ def test_numpy_array_equal_message(self):
         assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]))

         # scalar comparison
-        expected = """: 1 != 2"""
+        expected = """Expected type <class 'numpy\\.ndarray'>"""
         with assertRaisesRegexp(AssertionError, expected):
             assert_numpy_array_equal(1, 2)
         expected = """expected 2\\.00000 but got 1\\.00000, with decimal 5"""
@@ -192,6 +201,7 @@
\\[right\\]: int"""

         with assertRaisesRegexp(AssertionError, expected):
+            # numpy_array_equal only accepts np.ndarray
             assert_numpy_array_equal(np.array([1]), 1)
         with assertRaisesRegexp(AssertionError, expected):
             assert_almost_equal(np.array([1]), 1)
@@ -215,11 +225,11 @@
\\[right\\]: \\[1\\.0, nan, 3\\.0\\]"""

         with assertRaisesRegexp(AssertionError, expected):
-            assert_numpy_array_equal(
-                np.array([np.nan, 2, 3]), np.array([1, np.nan, 3]))
+            assert_numpy_array_equal(np.array([np.nan, 2, 3]),
+                                     np.array([1, np.nan, 3]))

         with assertRaisesRegexp(AssertionError, expected):
-            assert_almost_equal(
-                np.array([np.nan, 2, 3]), np.array([1, np.nan, 3]))
+            assert_almost_equal(np.array([np.nan, 2, 3]),
+                                np.array([1, np.nan, 3]))

         expected = """numpy array are different

@@ -339,8 +349,8 @@ def test_index_equal_message(self):
labels=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)"""

         idx1 = pd.Index([1, 2, 3])
-        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4
-                                                                         )])
+        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2),
+                                          ('B', 3), ('B', 4)])
         with assertRaisesRegexp(AssertionError, expected):
             assert_index_equal(idx1, idx2, exact=False)

@@ -350,10 +360,10 @@
\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""

-        idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), ('B', 3), ('B', 4
-                                                                         )])
-        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4
-                                                                         )])
+        idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2),
+                                          ('B', 3), ('B', 4)])
+        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2),
+                                          ('B', 3), ('B', 4)])
         with assertRaisesRegexp(AssertionError, expected):
             assert_index_equal(idx1, idx2)
         with assertRaisesRegexp(AssertionError, expected):
@@ -434,10 +444,10 @@
\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""

-        idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), ('B', 3), ('B', 4
-                                                                         )])
-        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4
-                                                                         )])
+        idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2),
+                                          ('B', 3), ('B', 4)])
+        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2),
+                                          ('B', 3), ('B', 4)])
         with assertRaisesRegexp(AssertionError, expected):
             assert_index_equal(idx1, idx2)
         with assertRaisesRegexp(AssertionError, expected):
@@ -509,12 +519,18 @@ def test_less_precise(self):
         self.assertRaises(AssertionError, assert_series_equal, s1, s2)
         self._assert_equal(s1, s2, check_less_precise=True)
+        for i in range(4):
+            self._assert_equal(s1, s2, check_less_precise=i)
+
         self.assertRaises(AssertionError, assert_series_equal, s1, s2, 10)

         s1 = Series([0.12345], dtype='float32')
         s2 = Series([0.12346], dtype='float32')
         self.assertRaises(AssertionError, assert_series_equal, s1, s2)
         self._assert_equal(s1, s2, check_less_precise=True)
+        for i in range(4):
+
self._assert_equal(s1, s2, check_less_precise=i) + self.assertRaises(AssertionError, assert_series_equal, s1, s2, 10) # even less than less precise s1 = Series([0.1235], dtype='float32') @@ -674,6 +690,45 @@ def test_notisinstance(self): tm.assertNotIsInstance(pd.Series([1]), pd.Series) +class TestAssertCategoricalEqual(unittest.TestCase): + _multiprocess_can_split_ = True + + def test_categorical_equal_message(self): + + expected = """Categorical\\.categories are different + +Categorical\\.categories values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 5\\], dtype='int64'\\)""" + + a = pd.Categorical([1, 2, 3, 4]) + b = pd.Categorical([1, 2, 3, 5]) + with assertRaisesRegexp(AssertionError, expected): + tm.assert_categorical_equal(a, b) + + expected = """Categorical\\.codes are different + +Categorical\\.codes values are different \\(50\\.0 %\\) +\\[left\\]: \\[0, 1, 3, 2\\] +\\[right\\]: \\[0, 1, 2, 3\\]""" + + a = pd.Categorical([1, 2, 4, 3], categories=[1, 2, 3, 4]) + b = pd.Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) + with assertRaisesRegexp(AssertionError, expected): + tm.assert_categorical_equal(a, b) + + expected = """Categorical are different + +Attribute "ordered" are different +\\[left\\]: False +\\[right\\]: True""" + + a = pd.Categorical([1, 2, 3, 4], ordered=False) + b = pd.Categorical([1, 2, 3, 4], ordered=True) + with assertRaisesRegexp(AssertionError, expected): + tm.assert_categorical_equal(a, b) + + class TestRNGContext(unittest.TestCase): def test_RNGContext(self): diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py deleted file mode 100644 index 854b7295aece4..0000000000000 --- a/pandas/tests/test_tseries.py +++ /dev/null @@ -1,711 +0,0 @@ -# -*- coding: utf-8 -*- -from numpy import nan -import numpy as np -from pandas import Index, isnull, Timestamp -from pandas.util.testing import assert_almost_equal -import pandas.util.testing as tm -from pandas.compat import range, lrange, zip -import pandas.lib as lib -import pandas._period as period -import pandas.algos as algos -from pandas.core import common as com -import datetime - - -class TestTseriesUtil(tm.TestCase): - _multiprocess_can_split_ = True - - def test_combineFunc(self): - pass - - def test_reindex(self): - pass - - def test_isnull(self): - pass - - def test_groupby(self): - pass - - def test_groupby_withnull(self): - pass - - def test_backfill(self): - old = Index([1, 5, 10]) - new = Index(lrange(12)) - - filler = algos.backfill_int64(old.values, new.values) - - expect_filler = [0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1] - self.assert_numpy_array_equal(filler, expect_filler) - - # corner case - old = Index([1, 4]) - new = Index(lrange(5, 10)) - filler = algos.backfill_int64(old.values, new.values) - - expect_filler = [-1, -1, -1, -1, -1] - self.assert_numpy_array_equal(filler, expect_filler) - - def test_pad(self): - old = Index([1, 5, 10]) - new = Index(lrange(12)) - - filler = algos.pad_int64(old.values, new.values) - - expect_filler = [-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2] - self.assert_numpy_array_equal(filler, expect_filler) - - # corner case - old = Index([5, 10]) - new = Index(lrange(5)) - filler = algos.pad_int64(old.values, new.values) - expect_filler = [-1, -1, -1, -1, -1] - self.assert_numpy_array_equal(filler, expect_filler) - - -def test_left_join_indexer_unique(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([2, 2, 3, 4, 4], dtype=np.int64) - - result = 
algos.left_join_indexer_unique_int64(b, a) - expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) - assert (np.array_equal(result, expected)) - - -def test_left_outer_join_bug(): - left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, - 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, - 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, - 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, - 3, 1, 2, 0, 2], dtype=np.int64) - - right = np.array([3, 1], dtype=np.int64) - max_groups = 4 - - lidx, ridx = algos.left_outer_join(left, right, max_groups, sort=False) - - exp_lidx = np.arange(len(left)) - exp_ridx = -np.ones(len(left)) - exp_ridx[left == 1] = 1 - exp_ridx[left == 3] = 0 - - assert (np.array_equal(lidx, exp_lidx)) - assert (np.array_equal(ridx, exp_ridx)) - - -def test_inner_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = algos.inner_join_indexer_int64(a, b) - - index_exp = np.array([3, 5], dtype=np.int64) - assert_almost_equal(index, index_exp) - - aexp = np.array([2, 4], dtype=np.int64) - bexp = np.array([1, 2], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = algos.inner_join_indexer_int64(a, b) - assert_almost_equal(index, [5]) - assert_almost_equal(ares, [0]) - assert_almost_equal(bres, [0]) - - -def test_outer_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = algos.outer_join_indexer_int64(a, b) - - index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) - assert_almost_equal(index, index_exp) - - aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) - bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = algos.outer_join_indexer_int64(a, b) - assert_almost_equal(index, [5]) - assert_almost_equal(ares, [0]) - assert_almost_equal(bres, [0]) - - -def test_left_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = algos.left_join_indexer_int64(a, b) - - assert_almost_equal(index, a) - - aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) - bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = algos.left_join_indexer_int64(a, b) - assert_almost_equal(index, [5]) - assert_almost_equal(ares, [0]) - assert_almost_equal(bres, [0]) - - -def test_left_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = algos.left_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_outer_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = algos.outer_join_indexer_int64(idx2.values, idx.values) - - 
exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_inner_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = algos.inner_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_is_lexsorted(): - failure = [ - np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, - 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0]), - np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, - 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, - 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, - 12, 11, - 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, - 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, - 9, 8, - 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22, - 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, - 6, 5, - 4, 3, 2, 1, 0])] - - assert (not algos.is_lexsorted(failure)) - -# def test_get_group_index(): -# a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64) -# b = np.array([1, 0, 3, 2, 0, 2, 3, 0], dtype=np.int64) -# expected = np.array([1, 4, 11, 2, 8, 6, 3, 0], dtype=np.int64) - -# result = lib.get_group_index([a, b], (3, 4)) - -# assert(np.array_equal(result, expected)) - - -def test_groupsort_indexer(): - a = np.random.randint(0, 1000, 100).astype(np.int64) - b = np.random.randint(0, 1000, 100).astype(np.int64) - - result = algos.groupsort_indexer(a, 1000)[0] - - # need to use a stable sort - expected = np.argsort(a, kind='mergesort') - assert (np.array_equal(result, expected)) - - # compare with lexsort - key = a * 1000 + b - result = algos.groupsort_indexer(key, 1000000)[0] - expected = np.lexsort((b, a)) - assert (np.array_equal(result, expected)) - - -def test_ensure_platform_int(): - arr = np.arange(100) - - result = algos.ensure_platform_int(arr) - assert (result is arr) - - -def test_duplicated_with_nas(): - keys = np.array([0, 1, nan, 0, 2, nan], dtype=object) - - result = lib.duplicated(keys) - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='first') - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='last') - expected = [True, False, True, False, False, False] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep=False) - expected = [True, False, True, True, False, True] - assert (np.array_equal(result, expected)) - - keys = np.empty(8, dtype=object) - for i, t in enumerate(zip([0, 0, nan, nan] * 2, [0, nan, 0, nan] * 2)): - keys[i] = t - - result = lib.duplicated(keys) - falses = [False] * 4 - trues = [True] * 4 - expected = falses + trues - assert 
(np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='last') - expected = trues + falses - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep=False) - expected = trues + trues - assert (np.array_equal(result, expected)) - - -def test_maybe_booleans_to_slice(): - arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8) - result = lib.maybe_booleans_to_slice(arr) - assert (result.dtype == np.bool_) - - result = lib.maybe_booleans_to_slice(arr[:0]) - assert (result == slice(0, 0)) - - -def test_convert_objects(): - arr = np.array(['a', 'b', nan, nan, 'd', 'e', 'f'], dtype='O') - result = lib.maybe_convert_objects(arr) - assert (result.dtype == np.object_) - - -def test_convert_infs(): - arr = np.array(['inf', 'inf', 'inf'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False) - assert (result.dtype == np.float64) - - arr = np.array(['-inf', '-inf', '-inf'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False) - assert (result.dtype == np.float64) - - -def test_scientific_no_exponent(): - # See PR 12215 - arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False, True) - assert np.all(np.isnan(result)) - - -def test_convert_objects_ints(): - # test that we can detect many kinds of integers - dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] - - for dtype_str in dtypes: - arr = np.array(list(np.arange(20, dtype=dtype_str)), dtype='O') - assert (arr[0].dtype == np.dtype(dtype_str)) - result = lib.maybe_convert_objects(arr) - assert (issubclass(result.dtype.type, np.integer)) - - -def test_convert_objects_complex_number(): - for dtype in np.sctypes['complex']: - arr = np.array(list(1j * np.arange(20, dtype=dtype)), dtype='O') - assert (arr[0].dtype == np.dtype(dtype)) - result = lib.maybe_convert_objects(arr) - assert (issubclass(result.dtype.type, np.complexfloating)) - - -def test_rank(): - tm._skip_if_no_scipy() - from scipy.stats import rankdata - - def _check(arr): - mask = ~np.isfinite(arr) - arr = arr.copy() - result = algos.rank_1d_float64(arr) - arr[mask] = np.inf - exp = rankdata(arr) - exp[mask] = nan - assert_almost_equal(result, exp) - - _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) - _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) - - -def test_get_reverse_indexer(): - indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) - result = lib.get_reverse_indexer(indexer, 5) - expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) - assert (np.array_equal(result, expected)) - - -def test_pad_backfill_object_segfault(): - - old = np.array([], dtype='O') - new = np.array([datetime.datetime(2010, 12, 31)], dtype='O') - - result = algos.pad_object(old, new) - expected = np.array([-1], dtype=np.int64) - assert (np.array_equal(result, expected)) - - result = algos.pad_object(new, old) - expected = np.array([], dtype=np.int64) - assert (np.array_equal(result, expected)) - - result = algos.backfill_object(old, new) - expected = np.array([-1], dtype=np.int64) - assert (np.array_equal(result, expected)) - - result = algos.backfill_object(new, old) - expected = np.array([], dtype=np.int64) - assert (np.array_equal(result, expected)) - - -def test_arrmap(): - values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O') - result = algos.arrmap_object(values, lambda x: x in ['foo', 'bar']) - assert (result.dtype == np.bool_) - - -def test_series_grouper(): - from pandas import Series - obj = Series(np.random.randn(10)) - 
dummy = obj[:0] - - labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) - - grouper = lib.SeriesGrouper(obj, np.mean, labels, 2, dummy) - result, counts = grouper.get_result() - - expected = np.array([obj[3:6].mean(), obj[6:].mean()]) - assert_almost_equal(result, expected) - - exp_counts = np.array([3, 4], dtype=np.int64) - assert_almost_equal(counts, exp_counts) - - -def test_series_bin_grouper(): - from pandas import Series - obj = Series(np.random.randn(10)) - dummy = obj[:0] - - bins = np.array([3, 6]) - - grouper = lib.SeriesBinGrouper(obj, np.mean, bins, dummy) - result, counts = grouper.get_result() - - expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) - assert_almost_equal(result, expected) - - exp_counts = np.array([3, 3, 4], dtype=np.int64) - assert_almost_equal(counts, exp_counts) - - -class TestBinGroupers(tm.TestCase): - _multiprocess_can_split_ = True - - def setUp(self): - self.obj = np.random.randn(10, 1) - self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64) - self.bins = np.array([3, 6], dtype=np.int64) - - def test_generate_bins(self): - from pandas.core.groupby import generate_bins_generic - values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) - binner = np.array([0, 3, 6, 9], dtype=np.int64) - - for func in [lib.generate_bins_dt64, generate_bins_generic]: - bins = func(values, binner, closed='left') - assert ((bins == np.array([2, 5, 6])).all()) - - bins = func(values, binner, closed='right') - assert ((bins == np.array([3, 6, 6])).all()) - - for func in [lib.generate_bins_dt64, generate_bins_generic]: - values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) - binner = np.array([0, 3, 6], dtype=np.int64) - - bins = func(values, binner, closed='right') - assert ((bins == np.array([3, 6])).all()) - - self.assertRaises(ValueError, generate_bins_generic, values, [], - 'right') - self.assertRaises(ValueError, generate_bins_generic, values[:0], - binner, 'right') - - self.assertRaises(ValueError, generate_bins_generic, values, [4], - 'right') - self.assertRaises(ValueError, generate_bins_generic, values, [-3, -1], - 'right') - - -def test_group_ohlc(): - def _check(dtype): - obj = np.array(np.random.randn(20), dtype=dtype) - - bins = np.array([6, 12, 20]) - out = np.zeros((3, 4), dtype) - counts = np.zeros(len(out), dtype=np.int64) - labels = com._ensure_int64(np.repeat( - np.arange(3), np.diff(np.r_[0, bins]))) - - func = getattr(algos, 'group_ohlc_%s' % dtype) - func(out, counts, obj[:, None], labels) - - def _ohlc(group): - if isnull(group).all(): - return np.repeat(nan, 4) - return [group[0], group.max(), group.min(), group[-1]] - - expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:]) - ]) - - assert_almost_equal(out, expected) - assert_almost_equal(counts, [6, 6, 8]) - - obj[:6] = nan - func(out, counts, obj[:, None], labels) - expected[0] = nan - assert_almost_equal(out, expected) - - _check('float32') - _check('float64') - - -def test_try_parse_dates(): - from dateutil.parser import parse - - arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) - - result = lib.try_parse_dates(arr, dayfirst=True) - expected = [parse(d, dayfirst=True) for d in arr] - assert (np.array_equal(result, expected)) - - -class TestTypeInference(tm.TestCase): - _multiprocess_can_split_ = True - - def test_length_zero(self): - result = lib.infer_dtype(np.array([], dtype='i4')) - self.assertEqual(result, 'integer') - - result = lib.infer_dtype([]) - self.assertEqual(result, 'empty') - - def 
test_integers(self): - arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='i4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - def test_bools(self): - arr = np.array([True, False, True, True, True], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([True, False, True, 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - arr = np.array([True, False, True], dtype=bool) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - def test_floats(self): - arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], - dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='f4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, 4, 5], dtype='f8') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - def test_string(self): - pass - - def test_unicode(self): - pass - - def test_datetime(self): - - dates = [datetime.datetime(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'datetime64') - - def test_date(self): - - dates = [datetime.date(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'date') - - def test_to_object_array_tuples(self): - r = (5, 6) - values = [r] - result = lib.to_object_array_tuples(values) - - try: - # make sure record array works - from collections import namedtuple - record = namedtuple('record', 'x y') - r = record(5, 6) - values = [r] - result = lib.to_object_array_tuples(values) # noqa - except ImportError: - pass - - def test_object(self): - - # GH 7431 - # cannot infer more than this as only a single element - arr = np.array([None], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - def test_categorical(self): - - # GH 8974 - from pandas import Categorical, Series - arr = Categorical(list('abc')) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - -class TestMoments(tm.TestCase): - pass - - -class TestReducer(tm.TestCase): - def test_int_index(self): - from pandas.core.series import Series - - arr = np.random.randn(100, 4) - result = lib.reduce(arr, np.sum, labels=Index(np.arange(4))) - expected = arr.sum(0) - assert_almost_equal(result, expected) - - result = lib.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) - expected = arr.sum(1) - assert_almost_equal(result, expected) - - dummy = Series(0., index=np.arange(100)) - result = lib.reduce(arr, np.sum, dummy=dummy, - 
labels=Index(np.arange(4))) - expected = arr.sum(0) - assert_almost_equal(result, expected) - - dummy = Series(0., index=np.arange(4)) - result = lib.reduce(arr, np.sum, axis=1, dummy=dummy, - labels=Index(np.arange(100))) - expected = arr.sum(1) - assert_almost_equal(result, expected) - - result = lib.reduce(arr, np.sum, axis=1, dummy=dummy, - labels=Index(np.arange(100))) - assert_almost_equal(result, expected) - - -class TestTsUtil(tm.TestCase): - def test_min_valid(self): - # Ensure that Timestamp.min is a valid Timestamp - Timestamp(Timestamp.min) - - def test_max_valid(self): - # Ensure that Timestamp.max is a valid Timestamp - Timestamp(Timestamp.max) - - def test_to_datetime_bijective(self): - # Ensure that converting to datetime and back only loses precision - # by going from nanoseconds to microseconds. - self.assertEqual( - Timestamp(Timestamp.max.to_pydatetime()).value / 1000, - Timestamp.max.value / 1000) - self.assertEqual( - Timestamp(Timestamp.min.to_pydatetime()).value / 1000, - Timestamp.min.value / 1000) - - -class TestPeriodField(tm.TestCase): - def test_get_period_field_raises_on_out_of_range(self): - self.assertRaises(ValueError, period.get_period_field, -1, 0, 0) - - def test_get_period_field_array_raises_on_out_of_range(self): - self.assertRaises(ValueError, period.get_period_field_arr, -1, - np.empty(1), 0) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 22ac583a3b808..2ec419221c6d8 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -6,26 +6,30 @@ from nose.tools import assert_raises from datetime import datetime from numpy.random import randn -from numpy.testing.decorators import slow import numpy as np from distutils.version import LooseVersion import pandas as pd from pandas import (Series, DataFrame, Panel, bdate_range, isnull, notnull, concat) -from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assert_panel_equal, - assert_index_equal, assert_numpy_array_equal) import pandas.core.datetools as datetools import pandas.stats.moments as mom import pandas.core.window as rwindow from pandas.core.base import SpecificationError +from pandas.core.common import UnsupportedFunctionCall import pandas.util.testing as tm from pandas.compat import range, zip, PY3 N, K = 100, 10 +def assert_equal(left, right): + if isinstance(left, Series): + tm.assert_series_equal(left, right) + else: + tm.assert_frame_equal(left, right) + + class Base(tm.TestCase): _multiprocess_can_split_ = True @@ -93,11 +97,11 @@ def tests_skip_nuisance(self): expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9], 'B': [np.nan, np.nan, 18, 21, 24]}, columns=list('AB')) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = pd.concat([r[['A', 'B']].sum(), df[['C']]], axis=1) result = r.sum() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg(self): df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) @@ -114,50 +118,51 @@ def test_agg(self): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean', 'std']]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r.aggregate({'A': np.mean, 'B': np.std}) expected = pd.concat([a_mean, b_std], axis=1) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({'A': ['mean', 'std']}) expected = 
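The to_datetime bijectivity test above tolerates exactly one unit of precision loss; a short sketch of the idea:

    from pandas import Timestamp

    # datetime.datetime only holds microseconds, so the nanosecond part of
    # Timestamp.max is dropped on the round trip; everything above survives
    ts = Timestamp(Timestamp.max.to_pydatetime())
    assert ts.value // 1000 == Timestamp.max.value // 1000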
pd.concat([a_mean, a_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'std')]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r['A'].aggregate(['mean', 'sum']) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = ['mean', 'sum'] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) expected = pd.concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', - 'sum')]) - assert_frame_equal(result, expected, check_like=True) + expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), + ('A', 'sum')]) + tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, 'B': {'mean2': 'mean', 'sum2': 'sum'}}) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ( - 'A', 'sum'), ('B', 'mean2'), ('B', 'sum2')]) - assert_frame_equal(result, expected, check_like=True) + exp_cols = [('A', 'mean'), ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')] + expected.columns = pd.MultiIndex.from_tuples(exp_cols) + tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']}) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ( - 'A', 'std'), ('B', 'mean'), ('B', 'std')]) - assert_frame_equal(result, expected, check_like=True) + + exp_cols = [('A', 'mean'), ('A', 'std'), ('B', 'mean'), ('B', 'std')] + expected.columns = pd.MultiIndex.from_tuples(exp_cols) + tm.assert_frame_equal(result, expected, check_like=True) # passed lambda result = r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) rcustom = r['B'].apply(lambda x: np.std(x, ddof=1)) expected = pd.concat([a_sum, rcustom], axis=1) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) def test_agg_consistency(self): @@ -194,13 +199,13 @@ def f(): 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, 'B': {'rb': ['mean', 'std']}}) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) result = r.agg({'A': {'ra': ['mean', 'std']}, 'B': {'rb': ['mean', 'std']}}) expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), ( 'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')]) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) def test_window_with_args(self): tm._skip_if_no_scipy() @@ -212,7 +217,7 @@ def test_window_with_args(self): expected.columns = ['', ''] result = r.aggregate([lambda x: x.mean(std=10), lambda x: x.mean(std=.01)]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def a(x): return x.mean(std=10) @@ -223,7 +228,7 @@ def b(x): expected = pd.concat([r.mean(std=10), r.mean(std=.01)], axis=1) expected.columns = ['a', 'b'] result = r.aggregate([a, b]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_preserve_metadata(self): # GH 10565 @@ -261,7 +266,7 @@ def test_how_compat(self): expected = getattr( getattr(s, t)(freq='D', **kwargs), op)(how=how) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) class TestWindow(Base): @@ -296,6 +301,18 
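These test_agg assertions encode the column layout rule: a dict of lists aggregates per column and yields MultiIndex result columns. A compact sketch using the same frame as the test:

    import pandas as pd

    df = pd.DataFrame({'A': range(5), 'B': range(0, 10, 2)})
    r = df.rolling(window=3)

    result = r.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']})
    # columns: MultiIndex [('A', 'mean'), ('A', 'std'),
    #                      ('B', 'mean'), ('B', 'std')]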
@@ def test_constructor(self): with self.assertRaises(ValueError): c(win_type=wt, window=2) + def test_numpy_compat(self): + # see gh-12811 + w = rwindow.Window(Series([2, 4, 6]), window=[0, 2]) + + msg = "numpy operations are not valid with window objects" + + for func in ('sum', 'mean'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(w, func), 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(w, func), dtype=np.float64) + class TestRolling(Base): @@ -323,6 +340,18 @@ def test_constructor(self): with self.assertRaises(ValueError): c(window=2, min_periods=1, center=w) + def test_numpy_compat(self): + # see gh-12811 + r = rwindow.Rolling(Series([2, 4, 6]), window=2) + + msg = "numpy operations are not valid with window objects" + + for func in ('std', 'mean', 'sum', 'max', 'min', 'var'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(r, func), 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(r, func), dtype=np.float64) + class TestExpanding(Base): @@ -347,6 +376,74 @@ def test_constructor(self): with self.assertRaises(ValueError): c(min_periods=1, center=w) + def test_numpy_compat(self): + # see gh-12811 + e = rwindow.Expanding(Series([2, 4, 6]), window=2) + + msg = "numpy operations are not valid with window objects" + + for func in ('std', 'mean', 'sum', 'max', 'min', 'var'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(e, func), 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(e, func), dtype=np.float64) + + +class TestEWM(Base): + + def setUp(self): + self._create_data() + + def test_constructor(self): + for o in [self.series, self.frame]: + c = o.ewm + + # valid + c(com=0.5) + c(span=1.5) + c(alpha=0.5) + c(halflife=0.75) + c(com=0.5, span=None) + c(alpha=0.5, com=None) + c(halflife=0.75, alpha=None) + + # not valid: mutually exclusive + with self.assertRaises(ValueError): + c(com=0.5, alpha=0.5) + with self.assertRaises(ValueError): + c(span=1.5, halflife=0.75) + with self.assertRaises(ValueError): + c(alpha=0.5, span=1.5) + + # not valid: com < 0 + with self.assertRaises(ValueError): + c(com=-0.5) + + # not valid: span < 1 + with self.assertRaises(ValueError): + c(span=0.5) + + # not valid: halflife <= 0 + with self.assertRaises(ValueError): + c(halflife=0) + + # not valid: alpha <= 0 or alpha > 1 + for alpha in (-0.5, 1.5): + with self.assertRaises(ValueError): + c(alpha=alpha) + + def test_numpy_compat(self): + # see gh-12811 + e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5) + + msg = "numpy operations are not valid with window objects" + + for func in ('std', 'mean', 'var'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(e, func), 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(e, func), dtype=np.float64) + class TestDeprecations(Base): """ test that we are catching deprecation warnings """ @@ -462,7 +559,7 @@ def test_dtypes(self): def check_dtypes(self, f, f_name, d, d_name, exp): roll = d.rolling(window=self.window) result = f(roll) - assert_almost_equal(result, exp) + tm.assert_almost_equal(result, exp) class TestDtype_object(Dtype): @@ -549,7 +646,7 @@ def check_dtypes(self, f, f_name, d, d_name, exp): if f_name == 'count': result = f(roll) - assert_almost_equal(result, exp) + tm.assert_almost_equal(result, exp) else: @@ -621,11 +718,11 @@ def test_cmov_mean(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): rs = mom.rolling_mean(vals, 5, center=True) - assert_almost_equal(xp, rs) + 
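The new gh-12811 tests above make window objects reject numpy-style arguments, and the TestEWM constructor pins the decay-parameter rules; a sketch of both behaviors from the user side (UnsupportedFunctionCall lives in pandas.core.common on this branch, per the import added above):

    from pandas import Series
    from pandas.core.common import UnsupportedFunctionCall

    s = Series([2, 4, 6])

    s.ewm(com=0.5)                   # valid: exactly one decay parameter
    try:
        s.ewm(com=0.5, alpha=0.5)    # com/span/halflife/alpha are exclusive
    except ValueError:
        pass

    try:
        s.rolling(2).mean(1, 2, 3)   # numpy-signature args are rejected
    except UnsupportedFunctionCall:
        pass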
tm.assert_almost_equal(xp, rs) xp = Series(rs) rs = Series(vals).rolling(5, center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window(self): # GH 8238 @@ -638,11 +735,11 @@ def test_cmov_window(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): rs = mom.rolling_window(vals, 5, 'boxcar', center=True) - assert_almost_equal(xp, rs) + tm.assert_almost_equal(xp, rs) xp = Series(rs) rs = Series(vals).rolling(5, win_type='boxcar', center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_corner(self): # GH 8238 @@ -684,7 +781,7 @@ def test_cmov_window_frame(self): # DataFrame rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).mean() - assert_frame_equal(DataFrame(xp), rs) + tm.assert_frame_equal(DataFrame(xp), rs) # invalid method with self.assertRaises(AttributeError): @@ -698,7 +795,7 @@ def test_cmov_window_frame(self): ], [np.nan, np.nan]]) rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).sum() - assert_frame_equal(DataFrame(xp), rs) + tm.assert_frame_equal(DataFrame(xp), rs) def test_cmov_window_na_min_periods(self): tm._skip_if_no_scipy() @@ -711,7 +808,7 @@ def test_cmov_window_na_min_periods(self): xp = vals.rolling(5, min_periods=4, center=True).mean() rs = vals.rolling(5, win_type='boxcar', min_periods=4, center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_regular(self): # GH 8238 @@ -744,7 +841,7 @@ def test_cmov_window_regular(self): for wt in win_types: xp = Series(xps[wt]) rs = Series(vals).rolling(5, win_type=wt, center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_regular_linear_range(self): # GH 8238 @@ -761,7 +858,7 @@ def test_cmov_window_regular_linear_range(self): for wt in win_types: rs = Series(vals).rolling(5, win_type=wt, center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_regular_missing_data(self): # GH 8238 @@ -794,7 +891,7 @@ def test_cmov_window_regular_missing_data(self): for wt in win_types: xp = Series(xps[wt]) rs = Series(vals).rolling(5, win_type=wt, min_periods=3).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_special(self): # GH 8238 @@ -821,7 +918,7 @@ def test_cmov_window_special(self): for wt, k in zip(win_types, kwds): xp = Series(xps[wt]) rs = Series(vals).rolling(5, win_type=wt, center=True).mean(**k) - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_special_linear_range(self): # GH 8238 @@ -839,7 +936,7 @@ def test_cmov_window_special_linear_range(self): for wt, k in zip(win_types, kwds): rs = Series(vals).rolling(5, win_type=wt, center=True).mean(**k) - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_rolling_median(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -853,7 +950,7 @@ def test_rolling_min(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): a = np.array([1, 2, 3, 4, 5]) b = mom.rolling_min(a, window=100, min_periods=1) - assert_almost_equal(b, np.ones(len(a))) + tm.assert_almost_equal(b, np.ones(len(a))) self.assertRaises(ValueError, mom.rolling_min, np.array([1, 2, 3]), window=3, min_periods=5) @@ -865,7 +962,7 @@ def test_rolling_max(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): a = np.array([1, 2, 3, 4, 5], dtype=np.float64) b = mom.rolling_max(a, 
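The cmov tests above rely on 'boxcar' being the unweighted window, so its mean coincides with a plain rolling mean; a small sketch (win_type requires scipy, which is why the tests call _skip_if_no_scipy):

    import numpy as np
    from pandas import Series

    vals = Series(np.random.randn(10))

    xp = vals.rolling(5, center=True).mean()
    rs = vals.rolling(5, win_type='boxcar', center=True).mean()  # same values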
window=100, min_periods=1) - assert_almost_equal(a, b) + tm.assert_almost_equal(a, b) self.assertRaises(ValueError, mom.rolling_max, np.array([1, 2, 3]), window=3, min_periods=5) @@ -901,7 +998,8 @@ def test_rolling_apply(self): category=RuntimeWarning) ser = Series([]) - assert_series_equal(ser, ser.rolling(10).apply(lambda x: x.mean())) + tm.assert_series_equal(ser, + ser.rolling(10).apply(lambda x: x.mean())) f = lambda x: x[np.isfinite(x)].mean() @@ -917,10 +1015,10 @@ def roll_mean(x, window, min_periods=None, freq=None, center=False, s = Series([None, None, None]) result = s.rolling(2, min_periods=0).apply(lambda x: len(x)) expected = Series([1., 2., 2.]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.rolling(2, min_periods=0).apply(len) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_rolling_apply_out_of_bounds(self): # #1850 @@ -933,7 +1031,7 @@ def test_rolling_apply_out_of_bounds(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = mom.rolling_apply(arr, 10, np.sum, min_periods=1) - assert_almost_equal(result, result) + tm.assert_almost_equal(result, result) def test_rolling_std(self): self._check_moment_func(mom.rolling_std, lambda x: np.std(x, ddof=1), @@ -946,13 +1044,13 @@ def test_rolling_std_1obs(self): result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), 1, min_periods=1) expected = np.array([np.nan] * 5) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), 1, min_periods=1, ddof=0) expected = np.zeros(5) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = mom.rolling_std(np.array([np.nan, np.nan, 3., 4., 5.]), @@ -1066,7 +1164,7 @@ def get_result(arr, window, min_periods=None, center=False): kwargs) result = get_result(self.arr, window) - assert_almost_equal(result[-1], static_comp(self.arr[-50:])) + tm.assert_almost_equal(result[-1], static_comp(self.arr[-50:])) if preserve_nan: assert (np.isnan(result[self._nan_locs]).all()) @@ -1078,7 +1176,7 @@ def get_result(arr, window, min_periods=None, center=False): if has_min_periods: result = get_result(arr, 50, min_periods=30) - assert_almost_equal(result[-1], static_comp(arr[10:-10])) + tm.assert_almost_equal(result[-1], static_comp(arr[10:-10])) # min_periods is working correctly result = get_result(arr, 20, min_periods=15) @@ -1096,10 +1194,10 @@ def get_result(arr, window, min_periods=None, center=False): # min_periods=0 result0 = get_result(arr, 20, min_periods=0) result1 = get_result(arr, 20, min_periods=1) - assert_almost_equal(result0, result1) + tm.assert_almost_equal(result0, result1) else: result = get_result(arr, 50) - assert_almost_equal(result[-1], static_comp(arr[10:-10])) + tm.assert_almost_equal(result[-1], static_comp(arr[10:-10])) # GH 7925 if has_center: @@ -1117,7 +1215,8 @@ def get_result(arr, window, min_periods=None, center=False): if test_stable: result = get_result(self.arr + 1e9, window) - assert_almost_equal(result[-1], static_comp(self.arr[-50:] + 1e9)) + tm.assert_almost_equal(result[-1], + static_comp(self.arr[-50:] + 1e9)) # Test window larger than array, #7297 if test_window: @@ -1131,14 +1230,15 @@ def get_result(arr, window, min_periods=None, center=False): self.assertTrue(np.array_equal(nan_mask, 
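The assertions above fix the min_periods=0 contract for apply: the function sees every partial window, even when all values are missing. Minimal sketch:

    from pandas import Series

    s = Series([None, None, None])
    result = s.rolling(2, min_periods=0).apply(lambda x: len(x))
    # per the test above: Series([1., 2., 2.])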
np.isnan( expected))) nan_mask = ~nan_mask - assert_almost_equal(result[nan_mask], expected[nan_mask]) + tm.assert_almost_equal(result[nan_mask], + expected[nan_mask]) else: result = get_result(self.arr, len(self.arr) + 1) expected = get_result(self.arr, len(self.arr)) nan_mask = np.isnan(result) self.assertTrue(np.array_equal(nan_mask, np.isnan(expected))) nan_mask = ~nan_mask - assert_almost_equal(result[nan_mask], expected[nan_mask]) + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) def _check_structures(self, f, static_comp, name=None, has_min_periods=True, has_time_rule=True, @@ -1190,11 +1290,12 @@ def get_result(obj, window, min_periods=None, freq=None, center=False): trunc_series = self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) - assert_almost_equal(series_result[-1], static_comp(trunc_series)) + self.assertAlmostEqual(series_result[-1], + static_comp(trunc_series)) - assert_series_equal(frame_result.xs(last_date), - trunc_frame.apply(static_comp), - check_names=False) + tm.assert_series_equal(frame_result.xs(last_date), + trunc_frame.apply(static_comp), + check_names=False) # GH 7925 if has_center: @@ -1233,8 +1334,8 @@ def get_result(obj, window, min_periods=None, freq=None, center=False): if fill_value is not None: series_xp = series_xp.fillna(fill_value) frame_xp = frame_xp.fillna(fill_value) - assert_series_equal(series_xp, series_rs) - assert_frame_equal(frame_xp, frame_rs) + tm.assert_series_equal(series_xp, series_rs) + tm.assert_frame_equal(frame_xp, frame_rs) def test_ewma(self): self._check_ew(mom.ewma, name='mean') @@ -1254,7 +1355,7 @@ def test_ewma(self): lambda s: s.ewm(com=2.0, adjust=True, ignore_na=True).mean(), ]: result = f(s) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) expected = Series([1.0, 1.333333, 2.222222, 4.148148]) for f in [lambda s: s.ewm(com=2.0, adjust=False).mean(), @@ -1264,7 +1365,7 @@ def test_ewma(self): ignore_na=True).mean(), ]: result = f(s) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_ewma_nan_handling(self): s = Series([1.] 
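The adjust=False expectations above follow directly from the EWM recursion; a worked sketch (the input Series([1.0, 2.0, 4.0, 8.0]) is the one implied by the expected values):

    from pandas import Series

    s = Series([1.0, 2.0, 4.0, 8.0])

    # with adjust=False:  y[0] = x[0];  y[t] = (1 - a) * y[t-1] + a * x[t],
    # where a = 1 / (1 + com), so com=2.0 gives a = 1/3
    result = s.ewm(com=2.0, adjust=False).mean()
    # -> [1.0, 1.333333, 2.222222, 4.148148], the expected Series above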
+ [np.nan] * 5 + [1.]) @@ -1315,11 +1416,11 @@ def simple_wma(s, w): expected = simple_wma(s, Series(w)) result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) if ignore_na is False: # check that ignore_na defaults to False result = s.ewm(com=com, adjust=adjust).mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_ewmvar(self): self._check_ew(mom.ewmvar, name='var') @@ -1331,7 +1432,7 @@ def test_ewma_span_com_args(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): A = mom.ewma(self.arr, com=9.5) B = mom.ewma(self.arr, span=20) - assert_almost_equal(A, B) + tm.assert_almost_equal(A, B) self.assertRaises(ValueError, mom.ewma, self.arr, com=9.5, span=20) self.assertRaises(ValueError, mom.ewma, self.arr) @@ -1340,7 +1441,7 @@ def test_ewma_halflife_arg(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): A = mom.ewma(self.arr, com=13.932726172912965) B = mom.ewma(self.arr, halflife=10.0) - assert_almost_equal(A, B) + tm.assert_almost_equal(A, B) self.assertRaises(ValueError, mom.ewma, self.arr, span=20, halflife=50) @@ -1357,9 +1458,9 @@ def test_ewma_alpha_old_api(self): b = mom.ewma(self.arr, com=0.62014947789973052) c = mom.ewma(self.arr, span=2.240298955799461) d = mom.ewma(self.arr, halflife=0.721792864318) - assert_numpy_array_equal(a, b) - assert_numpy_array_equal(a, c) - assert_numpy_array_equal(a, d) + tm.assert_numpy_array_equal(a, b) + tm.assert_numpy_array_equal(a, c) + tm.assert_numpy_array_equal(a, d) def test_ewma_alpha_arg_old_api(self): # GH 10789 @@ -1379,9 +1480,9 @@ def test_ewm_alpha(self): b = s.ewm(com=0.62014947789973052).mean() c = s.ewm(span=2.240298955799461).mean() d = s.ewm(halflife=0.721792864318).mean() - assert_series_equal(a, b) - assert_series_equal(a, c) - assert_series_equal(a, d) + tm.assert_series_equal(a, b) + tm.assert_series_equal(a, c) + tm.assert_series_equal(a, d) def test_ewm_alpha_arg(self): # GH 10789 @@ -1423,7 +1524,7 @@ def test_ew_empty_arrays(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = f(arr, 3) - assert_almost_equal(result, arr) + tm.assert_almost_equal(result, arr) def _check_ew(self, func, name=None): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -1460,16 +1561,16 @@ def _check_ew_ndarray(self, func, preserve_nan=False, name=None): # check series of length 0 result = func(Series([]), 50, min_periods=min_periods) - assert_series_equal(result, Series([])) + tm.assert_series_equal(result, Series([])) # check series of length 1 result = func(Series([1.]), 50, min_periods=min_periods) if func == mom.ewma: - assert_series_equal(result, Series([1.])) + tm.assert_series_equal(result, Series([1.])) else: # ewmstd, ewmvol, ewmvar with bias=False require at least two # values - assert_series_equal(result, Series([np.NaN])) + tm.assert_series_equal(result, Series([np.NaN])) # pass in ints result2 = func(np.arange(50), span=10) @@ -1601,8 +1702,6 @@ def _non_null_values(x): return set(values[notnull(values)].tolist()) for (x, is_constant, no_nans) in self.data: - assert_equal = assert_series_equal if isinstance( - x, Series) else assert_frame_equal count_x = count(x) mean_x = mean(x) @@ -1707,7 +1806,7 @@ def _non_null_values(x): assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) - @slow + @tm.slow def test_ewm_consistency(self): def _weights(s, com, adjust, ignore_na): if 
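The odd-looking constants in the alpha tests above are one decay expressed four ways; the conversions are com = 1/a - 1, span = 2/a - 1, halflife = log(0.5)/log(1 - a). A quick check (the alpha value is the one the companion assertions compare against, recalled here rather than visible in this hunk):

    import numpy as np

    alpha = 0.61722699889169674
    print(1.0 / alpha - 1.0)                  # ~0.62014947789973052 (com)
    print(2.0 / alpha - 1.0)                  # ~2.240298955799461   (span)
    print(np.log(0.5) / np.log(1.0 - alpha))  # ~0.721792864318      (halflife)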
isinstance(s, DataFrame): @@ -1806,7 +1905,7 @@ def _ewma(s, com, min_periods, adjust, ignore_na): _variance_debiasing_factors(x, com=com, adjust=adjust, ignore_na=ignore_na))) - @slow + @tm.slow def test_expanding_consistency(self): # suppress warnings about empty slices, as we are deliberately testing @@ -1849,8 +1948,6 @@ def test_expanding_consistency(self): # expanding_apply of Series.xyz(), or (b) expanding_apply of # np.nanxyz() for (x, is_constant, no_nans) in self.data: - assert_equal = assert_series_equal if isinstance( - x, Series) else assert_frame_equal functions = self.base_functions # GH 8269 @@ -1895,9 +1992,9 @@ def test_expanding_consistency(self): x.iloc[:, i].expanding( min_periods=min_periods), name)(x.iloc[:, j]) - assert_panel_equal(expanding_f_result, expected) + tm.assert_panel_equal(expanding_f_result, expected) - @slow + @tm.slow def test_rolling_consistency(self): # suppress warnings about empty slices, as we are deliberately testing @@ -1969,10 +2066,6 @@ def cases(): # rolling_apply of Series.xyz(), or (b) rolling_apply of # np.nanxyz() for (x, is_constant, no_nans) in self.data: - - assert_equal = (assert_series_equal - if isinstance(x, Series) else - assert_frame_equal) functions = self.base_functions # GH 8269 @@ -2023,7 +2116,7 @@ def cases(): min_periods=min_periods, center=center), name)(x.iloc[:, j])) - assert_panel_equal(rolling_f_result, expected) + tm.assert_panel_equal(rolling_f_result, expected) # binary moments def test_rolling_cov(self): @@ -2031,7 +2124,7 @@ def test_rolling_cov(self): B = A + randn(len(A)) result = A.rolling(window=50, min_periods=25).cov(B) - assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) + tm.assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) def test_rolling_cov_pairwise(self): self._check_pairwise_moment('rolling', 'cov', window=10, min_periods=5) @@ -2041,7 +2134,7 @@ def test_rolling_corr(self): B = A + randn(len(A)) result = A.rolling(window=50, min_periods=25).corr(B) - assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) + tm.assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) # test for correct bias correction a = tm.makeTimeSeries() @@ -2050,7 +2143,7 @@ def test_rolling_corr(self): b[:10] = np.nan result = a.rolling(window=len(a), min_periods=1).corr(b) - assert_almost_equal(result[-1], a.corr(b)) + tm.assert_almost_equal(result[-1], a.corr(b)) def test_rolling_corr_pairwise(self): self._check_pairwise_moment('rolling', 'corr', window=10, @@ -2151,18 +2244,18 @@ def func(A, B, com, **kwargs): # check series of length 0 result = func(Series([]), Series([]), 50, min_periods=min_periods) - assert_series_equal(result, Series([])) + tm.assert_series_equal(result, Series([])) # check series of length 1 result = func( Series([1.]), Series([1.]), 50, min_periods=min_periods) - assert_series_equal(result, Series([np.NaN])) + tm.assert_series_equal(result, Series([np.NaN])) self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5) def test_expanding_apply(self): ser = Series([]) - assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean())) + tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean())) def expanding_mean(x, min_periods=1, freq=None): return mom.expanding_apply(x, lambda x: x.mean(), @@ -2174,7 +2267,7 @@ def expanding_mean(x, min_periods=1, freq=None): s = Series([None, None, None]) result = s.expanding(min_periods=0).apply(lambda x: len(x)) expected = Series([1., 2., 3.]) - assert_series_equal(result, expected) + 
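test_rolling_corr above anchors the windowed result to numpy over the final window; as a standalone sketch:

    import numpy as np
    from numpy.random import randn
    from pandas import Series

    A = Series(randn(100))
    B = A + randn(100)

    result = A.rolling(window=50, min_periods=25).corr(B)
    # the last point agrees with numpy on the trailing window
    assert np.isclose(result.iloc[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])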
tm.assert_series_equal(result, expected) def test_expanding_apply_args_kwargs(self): def mean_w_arg(x, const): @@ -2184,11 +2277,11 @@ def mean_w_arg(x, const): expected = df.expanding().apply(np.mean) + 20. - assert_frame_equal(df.expanding().apply(mean_w_arg, args=(20, )), - expected) - assert_frame_equal(df.expanding().apply(mean_w_arg, - kwargs={'const': 20}), - expected) + tm.assert_frame_equal(df.expanding().apply(mean_w_arg, args=(20, )), + expected) + tm.assert_frame_equal(df.expanding().apply(mean_w_arg, + kwargs={'const': 20}), + expected) def test_expanding_corr(self): A = self.series.dropna() @@ -2198,11 +2291,11 @@ def test_expanding_corr(self): rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) - assert_almost_equal(rolling_result, result) + tm.assert_almost_equal(rolling_result, result) def test_expanding_count(self): result = self.series.expanding().count() - assert_almost_equal(result, self.series.rolling( + tm.assert_almost_equal(result, self.series.rolling( window=len(self.series)).count()) def test_expanding_quantile(self): @@ -2211,7 +2304,7 @@ def test_expanding_quantile(self): rolling_result = self.series.rolling(window=len(self.series), min_periods=1).quantile(0.5) - assert_almost_equal(result, rolling_result) + tm.assert_almost_equal(result, rolling_result) def test_expanding_cov(self): A = self.series @@ -2221,7 +2314,7 @@ def test_expanding_cov(self): rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) - assert_almost_equal(rolling_result, result) + tm.assert_almost_equal(rolling_result, result) def test_expanding_max(self): self._check_expanding(mom.expanding_max, np.max, preserve_nan=False) @@ -2233,7 +2326,7 @@ def test_expanding_cov_pairwise(self): min_periods=1).corr() for i in result.items: - assert_almost_equal(result[i], rolling_result[i]) + tm.assert_almost_equal(result[i], rolling_result[i]) def test_expanding_corr_pairwise(self): result = self.frame.expanding().corr() @@ -2242,7 +2335,7 @@ def test_expanding_corr_pairwise(self): min_periods=1).corr() for i in result.items: - assert_almost_equal(result[i], rolling_result[i]) + tm.assert_almost_equal(result[i], rolling_result[i]) def test_expanding_cov_diff_index(self): # GH 7512 @@ -2250,17 +2343,17 @@ def test_expanding_cov_diff_index(self): s2 = Series([1, 3], index=[0, 2]) result = s1.expanding().cov(s2) expected = Series([None, None, 2.0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.expanding().cov(s2a) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().cov(s2) expected = Series([None, None, None, 4.5]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_expanding_corr_diff_index(self): # GH 7512 @@ -2268,17 +2361,17 @@ def test_expanding_corr_diff_index(self): s2 = Series([1, 3], index=[0, 2]) result = s1.expanding().corr(s2) expected = Series([None, None, 1.0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.expanding().corr(s2a) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().corr(s2) expected = Series([None, None, None, 1.]) - assert_series_equal(result, expected) + 
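test_expanding_apply_args_kwargs above checks that extra arguments reach the applied function; a self-contained sketch (the small random frame is an assumed stand-in for the test's df, which is not shown in this hunk):

    import numpy as np
    from pandas import DataFrame

    def mean_w_arg(x, const):
        return np.mean(x) + const

    df = DataFrame(np.random.randn(20, 3))

    # both spellings add 20 to every expanding mean
    r1 = df.expanding().apply(mean_w_arg, args=(20,))
    r2 = df.expanding().apply(mean_w_arg, kwargs={'const': 20})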
tm.assert_series_equal(result, expected) def test_rolling_cov_diff_length(self): # GH 7512 @@ -2286,11 +2379,11 @@ def test_rolling_cov_diff_length(self): s2 = Series([1, 3], index=[0, 2]) result = s1.rolling(window=3, min_periods=2).cov(s2) expected = Series([None, None, 2.0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.rolling(window=3, min_periods=2).cov(s2a) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_rolling_corr_diff_length(self): # GH 7512 @@ -2298,11 +2391,11 @@ def test_rolling_corr_diff_length(self): s2 = Series([1, 3], index=[0, 2]) result = s1.rolling(window=3, min_periods=2).corr(s2) expected = Series([None, None, 1.0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.rolling(window=3, min_periods=2).corr(s2a) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_rolling_functions_window_non_shrinkage(self): # GH 7764 @@ -2334,10 +2427,10 @@ def test_rolling_functions_window_non_shrinkage(self): for f in functions: try: s_result = f(s) - assert_series_equal(s_result, s_expected) + tm.assert_series_equal(s_result, s_expected) df_result = f(df) - assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_result, df_expected) except (ImportError): # scipy needed for rolling_window @@ -2349,7 +2442,7 @@ def test_rolling_functions_window_non_shrinkage(self): .corr(x, pairwise=True))] for f in functions: df_result_panel = f(df) - assert_panel_equal(df_result_panel, df_expected_panel) + tm.assert_panel_equal(df_result_panel, df_expected_panel) def test_moment_functions_zero_length(self): # GH 8056 @@ -2404,13 +2497,13 @@ def test_moment_functions_zero_length(self): for f in functions: try: s_result = f(s) - assert_series_equal(s_result, s_expected) + tm.assert_series_equal(s_result, s_expected) df1_result = f(df1) - assert_frame_equal(df1_result, df1_expected) + tm.assert_frame_equal(df1_result, df1_expected) df2_result = f(df2) - assert_frame_equal(df2_result, df2_expected) + tm.assert_frame_equal(df2_result, df2_expected) except (ImportError): # scipy needed for rolling_window @@ -2427,10 +2520,10 @@ def test_moment_functions_zero_length(self): ] for f in functions: df1_result_panel = f(df1) - assert_panel_equal(df1_result_panel, df1_expected_panel) + tm.assert_panel_equal(df1_result_panel, df1_expected_panel) df2_result_panel = f(df2) - assert_panel_equal(df2_result_panel, df2_expected_panel) + tm.assert_panel_equal(df2_result_panel, df2_expected_panel) def test_expanding_cov_pairwise_diff_length(self): # GH 7512 @@ -2444,10 +2537,10 @@ def test_expanding_cov_pairwise_diff_length(self): result4 = df1a.expanding().cov(df2a, pairwise=True)[2] expected = DataFrame([[-3., -5.], [-6., -10.]], index=['A', 'B'], columns=['X', 'Y']) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected) - assert_frame_equal(result4, expected) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) def test_expanding_corr_pairwise_diff_length(self): # GH 7512 @@ -2461,35 +2554,29 @@ def test_expanding_corr_pairwise_diff_length(self): result4 = df1a.expanding().corr(df2a, pairwise=True)[2] expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], index=['A', 
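The GH 7512 cases above define alignment-before-windowing; reproduced minimally:

    from pandas import Series

    s1 = Series([1, 2, 3], index=[0, 1, 2])
    s2 = Series([1, 3], index=[0, 2])

    # s2 is aligned to s1's index (NaN at label 1) before the window runs
    result = s1.rolling(window=3, min_periods=2).corr(s2)
    # per the test above: Series([None, None, 1.0])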
'B'], columns=['X', 'Y']) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected) - assert_frame_equal(result4, expected) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) def test_pairwise_stats_column_names_order(self): # GH 7738 df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 'C']), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), - DataFrame( - [[2., 4.], [1., 2.], [5., 2.], [8., 1.]], columns=[1, 0.]), - DataFrame( - [[2, 4.], [1, 2.], [5, 2.], [8, 1.]], columns=[0, 1.]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1.]], columns=[1., 'X']), ] - df2 = DataFrame( - [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1] - ], columns=['Y', 'Z', 'X']) + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], + columns=['C', 'C']), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), + DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]], + columns=[1, 0.]), + DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]], + columns=[0, 1.]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]], + columns=[1., 'X']), ] + df2 = DataFrame([[None, 1, 1], [None, 1, 2], + [None, 3, 2], [None, 8, 1]], columns=['Y', 'Z', 'X']) s = Series([1, 1, 3, 8]) # suppress warnings about incomparable objects, as we are deliberately @@ -2503,11 +2590,13 @@ def test_pairwise_stats_column_names_order(self): for f in [lambda x: x.cov(), lambda x: x.corr(), ]: results = [f(df) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.index, df.columns) - assert_index_equal(result.columns, df.columns) + tm.assert_index_equal(result.index, df.columns) + tm.assert_index_equal(result.columns, df.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + # compare internal values, as columns can be different + self.assert_numpy_array_equal(result.values, + results[0].values) # DataFrame with itself, pairwise=True for f in [lambda x: x.expanding().cov(pairwise=True), @@ -2518,12 +2607,13 @@ def test_pairwise_stats_column_names_order(self): lambda x: x.ewm(com=3).corr(pairwise=True), ]: results = [f(df) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.items, df.index) - assert_index_equal(result.major_axis, df.columns) - assert_index_equal(result.minor_axis, df.columns) + tm.assert_index_equal(result.items, df.index) + tm.assert_index_equal(result.major_axis, df.columns) + tm.assert_index_equal(result.minor_axis, df.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + self.assert_numpy_array_equal(result.values, + results[0].values) # DataFrame with itself, pairwise=False for f in [lambda x: x.expanding().cov(pairwise=False), @@ -2534,11 +2624,12 @@ def 
test_pairwise_stats_column_names_order(self): lambda x: x.ewm(com=3).corr(pairwise=False), ]: results = [f(df) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.index, df.index) - assert_index_equal(result.columns, df.columns) + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + self.assert_numpy_array_equal(result.values, + results[0].values) # DataFrame with another DataFrame, pairwise=True for f in [lambda x, y: x.expanding().cov(y, pairwise=True), @@ -2549,12 +2640,13 @@ def test_pairwise_stats_column_names_order(self): lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ]: results = [f(df, df2) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.items, df.index) - assert_index_equal(result.major_axis, df.columns) - assert_index_equal(result.minor_axis, df2.columns) + tm.assert_index_equal(result.items, df.index) + tm.assert_index_equal(result.major_axis, df.columns) + tm.assert_index_equal(result.minor_axis, df2.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + self.assert_numpy_array_equal(result.values, + results[0].values) # DataFrame with another DataFrame, pairwise=False for f in [lambda x, y: x.expanding().cov(y, pairwise=False), @@ -2569,8 +2661,8 @@ def test_pairwise_stats_column_names_order(self): if result is not None: expected_index = df.index.union(df2.index) expected_columns = df.columns.union(df2.columns) - assert_index_equal(result.index, expected_index) - assert_index_equal(result.columns, expected_columns) + tm.assert_index_equal(result.index, expected_index) + tm.assert_index_equal(result.columns, expected_columns) else: tm.assertRaisesRegexp( ValueError, "'arg1' columns are not unique", f, df, @@ -2588,11 +2680,12 @@ def test_pairwise_stats_column_names_order(self): lambda x, y: x.ewm(com=3).corr(y), ]: results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.index, df.index) - assert_index_equal(result.columns, df.columns) + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + self.assert_numpy_array_equal(result.values, + results[0].values) def test_rolling_skew_edge_cases(self): @@ -2601,19 +2694,19 @@ def test_rolling_skew_edge_cases(self): # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5).skew() - assert_series_equal(all_nan, x) + tm.assert_series_equal(all_nan, x) # yields all NaN (window too small) d = Series(np.random.randn(5)) x = d.rolling(window=2).skew() - assert_series_equal(all_nan, x) + tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 0.177994, 1.548824] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401 ]) expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824]) x = d.rolling(window=4).skew() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def test_rolling_kurt_edge_cases(self): @@ -2622,25 +2715,25 @@ def test_rolling_kurt_edge_cases(self): # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5).kurt() - assert_series_equal(all_nan, x) + tm.assert_series_equal(all_nan, x) # yields all NaN (window too small) d = Series(np.random.randn(5)) x = 
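The skew edge cases above reduce to "zero variance or too few points gives NaN"; minimal sketch:

    import numpy as np
    from pandas import Series

    Series([1] * 5).rolling(window=5).skew()             # all NaN: zero variance
    Series(np.random.randn(5)).rolling(window=2).skew()  # all NaN: window too small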
d.rolling(window=3).kurt() - assert_series_equal(all_nan, x) + tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 1.224307, 2.671499] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401 ]) expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499]) x = d.rolling(window=4).kurt() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True, has_time_rule=True, preserve_nan=True): result = func(self.arr) - assert_almost_equal(result[10], static_comp(self.arr[:11])) + tm.assert_almost_equal(result[10], static_comp(self.arr[:11])) if preserve_nan: assert (np.isnan(result[self._nan_locs]).all()) @@ -2650,7 +2743,7 @@ def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True, if has_min_periods: result = func(arr, min_periods=30) assert (np.isnan(result[:29]).all()) - assert_almost_equal(result[-1], static_comp(arr[:50])) + tm.assert_almost_equal(result[-1], static_comp(arr[:50])) # min_periods is working correctly result = func(arr, min_periods=15) @@ -2665,10 +2758,10 @@ def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True, # min_periods=0 result0 = func(arr, min_periods=0) result1 = func(arr, min_periods=1) - assert_almost_equal(result0, result1) + tm.assert_almost_equal(result0, result1) else: result = func(arr) - assert_almost_equal(result[-1], static_comp(arr[:50])) + tm.assert_almost_equal(result[-1], static_comp(arr[:50])) def _check_expanding_structures(self, func): series_result = func(self.series) @@ -2702,7 +2795,7 @@ def test_rolling_max_gh6297(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = series.rolling(window=1, freq='D').max() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def test_rolling_max_how_resample(self): @@ -2721,14 +2814,14 @@ def test_rolling_max_how_resample(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = series.rolling(window=1, freq='D').max() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) # Now specify median (10.0) expected = Series([0.0, 1.0, 2.0, 3.0, 10.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = series.rolling(window=1, freq='D').max(how='median') - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) # Now specify mean (4+10+20)/3 v = (4.0 + 10.0 + 20.0) / 3.0 @@ -2736,7 +2829,7 @@ def test_rolling_max_how_resample(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = series.rolling(window=1, freq='D').max(how='mean') - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def test_rolling_min_how_resample(self): @@ -2755,7 +2848,7 @@ def test_rolling_min_how_resample(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): r = series.rolling(window=1, freq='D') - assert_series_equal(expected, r.min()) + tm.assert_series_equal(expected, r.min()) def test_rolling_median_how_resample(self): @@ -2774,7 +2867,7 @@ def test_rolling_median_how_resample(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = 
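And the matching kurtosis case, with the test's own data:

    from pandas import Series

    d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
    d.rolling(window=4).kurt()
    # per the test above: [NaN, NaN, NaN, 1.224307, 2.671499]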
series.rolling(window=1, freq='D').median() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def test_rolling_median_memory_error(self): # GH11722 @@ -2824,16 +2917,30 @@ def test_getitem(self): expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) result = g.rolling(2).mean().B - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = g.rolling(2).B.mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = g.B.rolling(2).mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = self.frame.B.groupby(self.frame.A).rolling(2).mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + + def test_getitem_multiple(self): + + # GH 13174 + g = self.frame.groupby('A') + r = g.rolling(2) + g_mutated = self.frame.groupby('A', mutated=True) + expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) + + result = r.B.count() + tm.assert_series_equal(result, expected) + + result = r.B.count() + tm.assert_series_equal(result, expected) def test_rolling(self): g = self.frame.groupby('A') @@ -2842,16 +2949,16 @@ def test_rolling(self): for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) for f in ['std', 'var']: result = getattr(r, f)(ddof=1) expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r.quantile(0.5) expected = g.apply(lambda x: x.rolling(4).quantile(0.5)) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_rolling_corr_cov(self): g = self.frame.groupby('A') @@ -2863,14 +2970,14 @@ def test_rolling_corr_cov(self): def func(x): return getattr(x.rolling(4), f)(self.frame) expected = g.apply(func) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = getattr(r.B, f)(pairwise=True) def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) expected = g.apply(func) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_rolling_apply(self): g = self.frame.groupby('A') @@ -2879,7 +2986,7 @@ def test_rolling_apply(self): # reduction result = r.apply(lambda x: x.sum()) expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum())) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_expanding(self): g = self.frame.groupby('A') @@ -2888,16 +2995,16 @@ def test_expanding(self): for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.expanding(), f)()) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) for f in ['std', 'var']: result = getattr(r, f)(ddof=0) expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r.quantile(0.5) expected = g.apply(lambda x: x.expanding().quantile(0.5)) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_expanding_corr_cov(self): g = self.frame.groupby('A') @@ -2909,14 +3016,14 @@ def test_expanding_corr_cov(self): def func(x): return getattr(x.expanding(), f)(self.frame) expected = g.apply(func) - assert_frame_equal(result, expected) + 
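The grouped-window tests above all reduce to one identity: a method on g.rolling(n) equals applying the same rolling method within each group. A sketch under that identity (the frame's exact shape is an assumption; the tests' setUp is not shown in this hunk):

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8,
                    'B': np.arange(40)})
    g = df.groupby('A')

    result = g.rolling(4).mean()
    expected = g.apply(lambda x: x.rolling(4).mean())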
tm.assert_frame_equal(result, expected) result = getattr(r.B, f)(pairwise=True) def func(x): return getattr(x.B.expanding(), f)(pairwise=True) expected = g.apply(func) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_expanding_apply(self): g = self.frame.groupby('A') @@ -2925,4 +3032,4 @@ def test_expanding_apply(self): # reduction result = r.apply(lambda x: x.sum()) expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum())) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index 2a9ad30a07805..d48b9baf64777 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -45,6 +45,16 @@ class TestCategoricalDtype(Base, tm.TestCase): def setUp(self): self.dtype = CategoricalDtype() + def test_hash_vs_equality(self): + # make sure that we satisfy is semantics + dtype = self.dtype + dtype2 = CategoricalDtype() + self.assertTrue(dtype == dtype2) + self.assertTrue(dtype2 == dtype) + self.assertTrue(dtype is dtype2) + self.assertTrue(dtype2 is dtype) + self.assertTrue(hash(dtype) == hash(dtype2)) + def test_equality(self): self.assertTrue(is_dtype_equal(self.dtype, 'category')) self.assertTrue(is_dtype_equal(self.dtype, CategoricalDtype())) @@ -88,6 +98,20 @@ class TestDatetimeTZDtype(Base, tm.TestCase): def setUp(self): self.dtype = DatetimeTZDtype('ns', 'US/Eastern') + def test_hash_vs_equality(self): + # make sure that we satisfy is semantics + dtype = self.dtype + dtype2 = DatetimeTZDtype('ns', 'US/Eastern') + dtype3 = DatetimeTZDtype(dtype2) + self.assertTrue(dtype == dtype2) + self.assertTrue(dtype2 == dtype) + self.assertTrue(dtype3 == dtype) + self.assertTrue(dtype is dtype2) + self.assertTrue(dtype2 is dtype) + self.assertTrue(dtype3 is dtype) + self.assertTrue(hash(dtype) == hash(dtype2)) + self.assertTrue(hash(dtype) == hash(dtype3)) + def test_construction(self): self.assertRaises(ValueError, lambda: DatetimeTZDtype('ms', 'US/Eastern')) diff --git a/pandas/tests/types/test_types.py b/pandas/tests/types/test_types.py new file mode 100644 index 0000000000000..b9f6006cab731 --- /dev/null +++ b/pandas/tests/types/test_types.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +import nose +import numpy as np + +from pandas import NaT +from pandas.types.api import (DatetimeTZDtype, CategoricalDtype, + na_value_for_dtype, pandas_dtype) + + +def test_pandas_dtype(): + + assert pandas_dtype('datetime64[ns, US/Eastern]') == DatetimeTZDtype( + 'datetime64[ns, US/Eastern]') + assert pandas_dtype('category') == CategoricalDtype() + for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: + assert pandas_dtype(dtype) == np.dtype(dtype) + + +def test_na_value_for_dtype(): + for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), + DatetimeTZDtype('datetime64[ns, US/Eastern]')]: + assert na_value_for_dtype(dtype) is NaT + + for dtype in ['u1', 'u2', 'u4', 'u8', + 'i1', 'i2', 'i4', 'i8']: + assert na_value_for_dtype(np.dtype(dtype)) == 0 + + for dtype in ['bool']: + assert na_value_for_dtype(np.dtype(dtype)) is False + + for dtype in ['f2', 'f4', 'f8']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + for dtype in ['O']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 3371f63db1e1c..182c0637ae29c 100644 --- 
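The new test_hash_vs_equality cases above assert something stronger than ==: the extension dtypes are cached, so equal instances are the same object. A sketch against this branch's pandas.types.api (the import path the new test_types.py below also uses):

    from pandas.types.api import CategoricalDtype

    a = CategoricalDtype()
    b = CategoricalDtype()
    assert a == b
    assert a is b                 # "is semantics": a cached instance
    assert hash(a) == hash(b)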
a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -7,6 +7,7 @@ import numpy as np from pandas.compat import range, lrange, lzip, zip, map, filter import pandas.compat as compat + from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame, _merge_doc from pandas.core.generic import NDFrame @@ -22,6 +23,7 @@ import pandas.core.algorithms as algos import pandas.core.common as com import pandas.types.concat as _concat +from pandas.types.api import na_value_for_dtype import pandas.algos as _algos import pandas.hashtable as _hash @@ -280,55 +282,78 @@ def _indicator_post_merge(self, result): return result def _maybe_add_join_keys(self, result, left_indexer, right_indexer): - # insert group keys + + left_has_missing = None + right_has_missing = None keys = zip(self.join_names, self.left_on, self.right_on) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): continue + take_left, take_right = None, None + if name in result: - key_indexer = result.columns.get_loc(name) if left_indexer is not None and right_indexer is not None: - if name in self.left: - if len(self.left) == 0: - continue - na_indexer = (left_indexer == -1).nonzero()[0] - if len(na_indexer) == 0: - continue + if left_has_missing is None: + left_has_missing = any(left_indexer == -1) + + if left_has_missing: + take_right = self.right_join_keys[i] + + if not com.is_dtype_equal(result[name].dtype, + self.left[name].dtype): + take_left = self.left[name]._values - right_na_indexer = right_indexer.take(na_indexer) - result.iloc[na_indexer, key_indexer] = ( - algos.take_1d(self.right_join_keys[i], - right_na_indexer)) elif name in self.right: - if len(self.right) == 0: - continue - na_indexer = (right_indexer == -1).nonzero()[0] - if len(na_indexer) == 0: - continue + if right_has_missing is None: + right_has_missing = any(right_indexer == -1) + + if right_has_missing: + take_left = self.left_join_keys[i] + + if not com.is_dtype_equal(result[name].dtype, + self.right[name].dtype): + take_right = self.right[name]._values - left_na_indexer = left_indexer.take(na_indexer) - result.iloc[na_indexer, key_indexer] = ( - algos.take_1d(self.left_join_keys[i], - left_na_indexer)) elif left_indexer is not None \ and isinstance(self.left_join_keys[i], np.ndarray): - if name is None: - name = 'key_%d' % i + take_left = self.left_join_keys[i] + take_right = self.right_join_keys[i] - # a faster way? 
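na_value_for_dtype, exercised by the new test above, is what the rewritten merge code below uses as the fill value when one side of a join key has no match:

    import numpy as np
    from pandas.types.api import na_value_for_dtype

    na_value_for_dtype(np.dtype('M8[ns]'))    # NaT
    na_value_for_dtype(np.dtype('int64'))     # 0
    na_value_for_dtype(np.dtype('bool'))      # False
    na_value_for_dtype(np.dtype('float64'))   # nan
    na_value_for_dtype(np.dtype('O'))         # nan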
- key_col = algos.take_1d(self.left_join_keys[i], left_indexer) - na_indexer = (left_indexer == -1).nonzero()[0] - right_na_indexer = right_indexer.take(na_indexer) - key_col.put(na_indexer, algos.take_1d(self.right_join_keys[i], - right_na_indexer)) - result.insert(i, name, key_col) + if take_left is not None or take_right is not None: + + if take_left is None: + lvals = result[name]._values + else: + lfill = na_value_for_dtype(take_left.dtype) + lvals = algos.take_1d(take_left, left_indexer, + fill_value=lfill) + + if take_right is None: + rvals = result[name]._values + else: + rfill = na_value_for_dtype(take_right.dtype) + rvals = algos.take_1d(take_right, right_indexer, + fill_value=rfill) + + # if we have an all missing left_indexer + # make sure to just use the right values + mask = left_indexer == -1 + if mask.all(): + key_col = rvals + else: + key_col = Index(lvals).where(~mask, rvals) + + if name in result: + result[name] = key_col + else: + result.insert(i, name or 'key_%d' % i, key_col) def _get_join_info(self): left_ax = self.left._data.axes[self.axis] diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index de79e54e22270..a4e6cc404a457 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -410,7 +410,11 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, Notes ----- Any Series passed will have their name attributes used unless row or column - names for the cross-tabulation are specified + names for the cross-tabulation are specified. + + Any input passed containing Categorical data will have **all** of its + categories included in the cross-tabulation, even if the actual data does + not contain any instances of a particular category. In the event that there aren't overlapping indexes an empty DataFrame will be returned. 
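The net effect of the rewritten _maybe_add_join_keys is easiest to see from the user side: on an outer merge the key column is assembled from whichever side actually has the value, rather than being patched row by row with iloc. An illustrative sketch, not the internal code path:

    import pandas as pd

    left = pd.DataFrame({'key': [1, 2], 'lval': ['a', 'b']})
    right = pd.DataFrame({'key': [2, 3], 'rval': ['x', 'y']})

    result = pd.merge(left, right, on='key', how='outer')
    # result['key'] is [1, 2, 3]: left values where present, right values
    # where only the right side matched (the `mask = left_indexer == -1` path)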
@@ -434,6 +438,16 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, bar 1 2 1 0 foo 2 2 1 2 + >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, + # but they still will be counted in the output + col_0 d e f + row_0 + a 1 0 0 + b 0 1 0 + c 0 0 0 + Returns ------- crosstab : DataFrame diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 808c9d22c53c8..baca8045f0cc1 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1331,6 +1331,10 @@ def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): x = x._mpl_repr() if is_errorbar: + if 'xerr' in kwds: + kwds['xerr'] = np.array(kwds.get('xerr')) + if 'yerr' in kwds: + kwds['yerr'] = np.array(kwds.get('yerr')) return ax.errorbar(x, y, **kwds) else: # prevent style kwarg from going to errorbar, where it is diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py new file mode 100644 index 0000000000000..9d9b0635e0f35 --- /dev/null +++ b/pandas/tools/tests/test_concat.py @@ -0,0 +1,1037 @@ +import nose + +import numpy as np +from numpy.random import randn + +from datetime import datetime +from pandas.compat import StringIO +import pandas as pd +from pandas import (DataFrame, concat, + read_csv, isnull, Series, date_range, + Index, Panel, MultiIndex, Timestamp, + DatetimeIndex) +from pandas.util import testing as tm +from pandas.util.testing import (assert_frame_equal, + makeCustomDataframe as mkdf, + assert_almost_equal) + + +class TestConcatenate(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.frame = DataFrame(tm.getSeriesData()) + self.mixed_frame = self.frame.copy() + self.mixed_frame['foo'] = 'bar' + + def test_append(self): + begin_index = self.frame.index[:5] + end_index = self.frame.index[5:] + + begin_frame = self.frame.reindex(begin_index) + end_frame = self.frame.reindex(end_index) + + appended = begin_frame.append(end_frame) + assert_almost_equal(appended['A'], self.frame['A']) + + del end_frame['A'] + partial_appended = begin_frame.append(end_frame) + self.assertIn('A', partial_appended) + + partial_appended = end_frame.append(begin_frame) + self.assertIn('A', partial_appended) + + # mixed type handling + appended = self.mixed_frame[:5].append(self.mixed_frame[5:]) + assert_frame_equal(appended, self.mixed_frame) + + # what to test here + mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) + mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) + + # all equal except 'foo' column + assert_frame_equal( + mixed_appended.reindex(columns=['A', 'B', 'C', 'D']), + mixed_appended2.reindex(columns=['A', 'B', 'C', 'D'])) + + # append empty + empty = DataFrame({}) + + appended = self.frame.append(empty) + assert_frame_equal(self.frame, appended) + self.assertIsNot(appended, self.frame) + + appended = empty.append(self.frame) + assert_frame_equal(self.frame, appended) + self.assertIsNot(appended, self.frame) + + # overlap + self.assertRaises(ValueError, self.frame.append, self.frame, + verify_integrity=True) + + # new columns + # GH 6129 + df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}}) + row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z') + expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': { + 'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}}) + result = df.append(row) + assert_frame_equal(result, expected) + + def 
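The GH 6129 case in test_append above covers appending a named Series as a new row, creating columns as needed:

    from pandas import DataFrame, Series

    df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
    row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z')

    result = df.append(row)
    # adds row 'z' and a new column 'c' (NaN for rows 'x' and 'y')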
test_append_length0_frame(self): + df = DataFrame(columns=['A', 'B', 'C']) + df3 = DataFrame(index=[0, 1], columns=['A', 'B']) + df5 = df.append(df3) + + expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) + assert_frame_equal(df5, expected) + + def test_append_records(self): + arr1 = np.zeros((2,), dtype=('i4,f4,a10')) + arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")] + + arr2 = np.zeros((3,), dtype=('i4,f4,a10')) + arr2[:] = [(3, 4., 'foo'), + (5, 6., "bar"), + (7., 8., 'baz')] + + df1 = DataFrame(arr1) + df2 = DataFrame(arr2) + + result = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate((arr1, arr2))) + assert_frame_equal(result, expected) + + def test_append_different_columns(self): + df = DataFrame({'bools': np.random.randn(10) > 0, + 'ints': np.random.randint(0, 10, 10), + 'floats': np.random.randn(10), + 'strings': ['foo', 'bar'] * 5}) + + a = df[:5].ix[:, ['bools', 'ints', 'floats']] + b = df[5:].ix[:, ['strings', 'ints', 'floats']] + + appended = a.append(b) + self.assertTrue(isnull(appended['strings'][0:4]).all()) + self.assertTrue(isnull(appended['bools'][5:]).all()) + + def test_append_many(self): + chunks = [self.frame[:5], self.frame[5:10], + self.frame[10:15], self.frame[15:]] + + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result, self.frame) + + chunks[-1] = chunks[-1].copy() + chunks[-1]['foo'] = 'bar' + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame) + self.assertTrue((result['foo'][15:] == 'bar').all()) + self.assertTrue(result['foo'][:15].isnull().all()) + + def test_append_preserve_index_name(self): + # #980 + df1 = DataFrame(data=None, columns=['A', 'B', 'C']) + df1 = df1.set_index(['A']) + df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], + columns=['A', 'B', 'C']) + df2 = df2.set_index(['A']) + + result = df1.append(df2) + self.assertEqual(result.index.name, 'A') + + def test_join_many(self): + df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) + df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] + + joined = df_list[0].join(df_list[1:]) + tm.assert_frame_equal(joined, df) + + df_list = [df[['a', 'b']][:-2], + df[['c', 'd']][2:], df[['e', 'f']][1:9]] + + def _check_diff_index(df_list, result, exp_index): + reindexed = [x.reindex(exp_index) for x in df_list] + expected = reindexed[0].join(reindexed[1:]) + tm.assert_frame_equal(result, expected) + + # different join types + joined = df_list[0].join(df_list[1:], how='outer') + _check_diff_index(df_list, joined, df.index) + + joined = df_list[0].join(df_list[1:]) + _check_diff_index(df_list, joined, df_list[0].index) + + joined = df_list[0].join(df_list[1:], how='inner') + _check_diff_index(df_list, joined, df.index[2:8]) + + self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') + + def test_join_many_mixed(self): + df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) + df['key'] = ['foo', 'bar'] * 4 + df1 = df.ix[:, ['A', 'B']] + df2 = df.ix[:, ['C', 'D']] + df3 = df.ix[:, ['key']] + + result = df1.join([df2, df3]) + assert_frame_equal(result, df) + + def test_append_missing_column_proper_upcast(self): + df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) + df2 = DataFrame({'B': np.array([True, False, True, False], + dtype=bool)}) + + appended = df1.append(df2, ignore_index=True) + self.assertEqual(appended['A'].dtype, 'f8') + self.assertEqual(appended['B'].dtype, 'O') + + def test_concat_copy(self): + + df = DataFrame(np.random.randn(4, 3)) + df2 = 
DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) + df3 = DataFrame({5: 'foo'}, index=range(4)) + + # these are actual copies + result = concat([df, df2, df3], axis=1, copy=True) + for b in result._data.blocks: + self.assertIsNone(b.values.base) + + # these are the same + result = concat([df, df2, df3], axis=1, copy=False) + for b in result._data.blocks: + if b.is_float: + self.assertTrue( + b.values.base is df._data.blocks[0].values.base) + elif b.is_integer: + self.assertTrue( + b.values.base is df2._data.blocks[0].values.base) + elif b.is_object: + self.assertIsNotNone(b.values.base) + + # float block was consolidated + df4 = DataFrame(np.random.randn(4, 1)) + result = concat([df, df2, df3, df4], axis=1, copy=False) + for b in result._data.blocks: + if b.is_float: + self.assertIsNone(b.values.base) + elif b.is_integer: + self.assertTrue( + b.values.base is df2._data.blocks[0].values.base) + elif b.is_object: + self.assertIsNotNone(b.values.base) + + def test_concat_with_group_keys(self): + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + # axis=0 + df = DataFrame(np.random.randn(3, 4)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1]) + exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1], + [0, 1, 2, 0, 1, 2, 3]]) + expected = DataFrame(np.r_[df.values, df2.values], + index=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1]) + exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 1, 2]]) + expected = DataFrame(np.r_[df.values, df.values], + index=exp_index2) + tm.assert_frame_equal(result, expected) + + # axis=1 + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df2.values], + columns=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df.values], + columns=exp_index2) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_specific_levels(self): + df = DataFrame(np.random.randn(10, 4)) + pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] + level = ['three', 'two', 'one', 'zero'] + result = concat(pieces, axis=1, keys=['one', 'two', 'three'], + levels=[level], + names=['group_key']) + + self.assert_index_equal(result.columns.levels[0], + Index(level, name='group_key')) + self.assertEqual(result.columns.names[0], 'group_key') + + def test_concat_dataframe_keys_bug(self): + t1 = DataFrame({ + 'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'], + name='id'))}) + t2 = DataFrame({ + 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) + + # it works + result = concat([t1, t2], axis=1, keys=['t1', 't2']) + self.assertEqual(list(result.columns), [('t1', 'value'), + ('t2', 'value')]) + + def test_concat_series_partial_columns_names(self): + # GH10698 + foo = Series([1, 2], name='foo') + bar = Series([1, 2]) + baz = Series([4, 5]) + + result = concat([foo, bar, baz], axis=1) + expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [ + 4, 5]}, columns=['foo', 0, 1]) + tm.assert_frame_equal(result, expected) + + result = concat([foo, bar, baz], axis=1, keys=[ + 'red', 'blue', 'yellow']) + expected = DataFrame({'red': [1, 2], 'blue': [1, 2], 'yellow': [ + 4, 5]}, columns=['red', 'blue', 'yellow']) + tm.assert_frame_equal(result, expected) + + result = concat([foo, bar, baz], axis=1, ignore_index=True) + expected = 
DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) + tm.assert_frame_equal(result, expected) + + def test_concat_dict(self): + frames = {'foo': DataFrame(np.random.randn(4, 3)), + 'bar': DataFrame(np.random.randn(4, 3)), + 'baz': DataFrame(np.random.randn(4, 3)), + 'qux': DataFrame(np.random.randn(4, 3))} + + sorted_keys = sorted(frames) + + result = concat(frames) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) + tm.assert_frame_equal(result, expected) + + result = concat(frames, axis=1) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, + axis=1) + tm.assert_frame_equal(result, expected) + + keys = ['baz', 'foo', 'bar'] + result = concat(frames, keys=keys) + expected = concat([frames[k] for k in keys], keys=keys) + tm.assert_frame_equal(result, expected) + + def test_concat_ignore_index(self): + frame1 = DataFrame({"test1": ["a", "b", "c"], + "test2": [1, 2, 3], + "test3": [4.5, 3.2, 1.2]}) + frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) + frame1.index = Index(["x", "y", "z"]) + frame2.index = Index(["x", "y", "q"]) + + v1 = concat([frame1, frame2], axis=1, ignore_index=True) + + nan = np.nan + expected = DataFrame([[nan, nan, nan, 4.3], + ['a', 1, 4.5, 5.2], + ['b', 2, 3.2, 2.2], + ['c', 3, 1.2, nan]], + index=Index(["q", "x", "y", "z"])) + + tm.assert_frame_equal(v1, expected) + + def test_concat_multiindex_with_keys(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + frame = DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + result = concat([frame, frame], keys=[0, 1], names=['iteration']) + + self.assertEqual(result.index.names, ('iteration',) + index.names) + tm.assert_frame_equal(result.ix[0], frame) + tm.assert_frame_equal(result.ix[1], frame) + self.assertEqual(result.index.nlevels, 3) + + def test_concat_multiindex_with_tz(self): + # GH 6606 + df = DataFrame({'dt': [datetime(2014, 1, 1), + datetime(2014, 1, 2), + datetime(2014, 1, 3)], + 'b': ['A', 'B', 'C'], + 'c': [1, 2, 3], 'd': [4, 5, 6]}) + df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific')) + df = df.set_index(['dt', 'b']) + + exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02', + '2014-01-03'] * 2, + tz='US/Pacific', name='dt') + exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b') + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2}, + index=exp_idx, columns=['c', 'd']) + + result = concat([df, df]) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_and_levels(self): + df = DataFrame(np.random.randn(1, 3)) + df2 = DataFrame(np.random.randn(1, 4)) + + levels = [['foo', 'baz'], ['one', 'two']] + names = ['first', 'second'] + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + levels=levels, + names=names) + expected = concat([df, df2, df, df2]) + exp_index = MultiIndex(levels=levels + [[0]], + labels=[[0, 0, 1, 1], [0, 1, 0, 1], + [0, 0, 0, 0]], + names=names + [None]) + expected.index = exp_index + + assert_frame_equal(result, expected) + + # no names + + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + levels=levels) + self.assertEqual(result.index.names, (None,) * 3) + + # no levels + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 
'one'), ('baz', 'two')], + names=['first', 'second']) + self.assertEqual(result.index.names, ('first', 'second') + (None,)) + self.assert_index_equal(result.index.levels[0], + Index(['baz', 'foo'], name='first')) + + def test_concat_keys_levels_no_overlap(self): + # GH #1406 + df = DataFrame(np.random.randn(1, 3), index=['a']) + df2 = DataFrame(np.random.randn(1, 4), index=['b']) + + self.assertRaises(ValueError, concat, [df, df], + keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + + self.assertRaises(ValueError, concat, [df, df2], + keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + + def test_concat_rename_index(self): + a = DataFrame(np.random.rand(3, 3), + columns=list('ABC'), + index=Index(list('abc'), name='index_a')) + b = DataFrame(np.random.rand(3, 3), + columns=list('ABC'), + index=Index(list('abc'), name='index_b')) + + result = concat([a, b], keys=['key0', 'key1'], + names=['lvl0', 'lvl1']) + + exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0']) + names = list(exp.index.names) + names[1] = 'lvl1' + exp.index.set_names(names, inplace=True) + + tm.assert_frame_equal(result, exp) + self.assertEqual(result.index.names, exp.index.names) + + def test_crossed_dtypes_weird_corner(self): + columns = ['A', 'B', 'C', 'D'] + df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'), + 'B': np.array([1, 2, 3, 4], dtype='i8'), + 'C': np.array([1, 2, 3, 4], dtype='f8'), + 'D': np.array([1, 2, 3, 4], dtype='i8')}, + columns=columns) + + df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'), + 'B': np.array([1, 2, 3, 4], dtype='f8'), + 'C': np.array([1, 2, 3, 4], dtype='i8'), + 'D': np.array([1, 2, 3, 4], dtype='f8')}, + columns=columns) + + appended = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0), + columns=columns) + tm.assert_frame_equal(appended, expected) + + df = DataFrame(np.random.randn(1, 3), index=['a']) + df2 = DataFrame(np.random.randn(1, 4), index=['b']) + result = concat( + [df, df2], keys=['one', 'two'], names=['first', 'second']) + self.assertEqual(result.index.names, ('first', 'second')) + + def test_dups_index(self): + # GH 4771 + + # single dtypes + df = DataFrame(np.random.randint(0, 10, size=40).reshape( + 10, 4), columns=['A', 'A', 'C', 'C']) + + result = concat([df, df], axis=1) + assert_frame_equal(result.iloc[:, :4], df) + assert_frame_equal(result.iloc[:, 4:], df) + + result = concat([df, df], axis=0) + assert_frame_equal(result.iloc[:10], df) + assert_frame_equal(result.iloc[10:], df) + + # multi dtypes + df = concat([DataFrame(np.random.randn(10, 4), + columns=['A', 'A', 'B', 'B']), + DataFrame(np.random.randint(0, 10, size=20) + .reshape(10, 2), + columns=['A', 'C'])], + axis=1) + + result = concat([df, df], axis=1) + assert_frame_equal(result.iloc[:, :6], df) + assert_frame_equal(result.iloc[:, 6:], df) + + result = concat([df, df], axis=0) + assert_frame_equal(result.iloc[:10], df) + assert_frame_equal(result.iloc[10:], df) + + # append + result = df.iloc[0:8, :].append(df.iloc[8:]) + assert_frame_equal(result, df) + + result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10]) + assert_frame_equal(result, df) + + expected = concat([df, df], axis=0) + result = df.append(df) + assert_frame_equal(result, expected) + + def test_with_mixed_tuples(self): + # 10697 + # columns have mixed tuples, so handle properly + df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2)) + df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2)) + + # it works + concat([df1, df2]) + 
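The mixed-tuple case in test_with_mixed_tuples above is easier to see in isolation; a short sketch under the same assumptions as the test (frames are illustrative):

    import pandas as pd

    # one frame keys a column by a plain label, the other by a tuple, and
    # concat must union the two column sets without trying to sort them
    df1 = pd.DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2))
    df2 = pd.DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2))
    out = pd.concat([df1, df2])
    assert out.shape == (4, 3)  # rows stacked; columns 'A', 'B' and ('B', 1)
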
+ def test_join_dups(self): + + # joining dups + df = concat([DataFrame(np.random.randn(10, 4), + columns=['A', 'A', 'B', 'B']), + DataFrame(np.random.randint(0, 10, size=20) + .reshape(10, 2), + columns=['A', 'C'])], + axis=1) + + expected = concat([df, df], axis=1) + result = df.join(df, rsuffix='_2') + result.columns = expected.columns + assert_frame_equal(result, expected) + + # GH 4975, invalid join on dups + w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + + dta = x.merge(y, left_index=True, right_index=True).merge( + z, left_index=True, right_index=True, how="outer") + dta = dta.merge(w, left_index=True, right_index=True) + expected = concat([x, y, z, w], axis=1) + expected.columns = ['x_x', 'y_x', 'x_y', + 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] + assert_frame_equal(dta, expected) + + def test_handle_empty_objects(self): + df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) + + baz = df[:5].copy() + baz['foo'] = 'bar' + empty = df[5:5] + + frames = [baz, empty, empty, df[5:]] + concatted = concat(frames, axis=0) + + expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']] + expected['foo'] = expected['foo'].astype('O') + expected.loc[0:4, 'foo'] = 'bar' + + tm.assert_frame_equal(concatted, expected) + + # empty as first element with time series + # GH3259 + df = DataFrame(dict(A=range(10000)), index=date_range( + '20130101', periods=10000, freq='s')) + empty = DataFrame() + result = concat([df, empty], axis=1) + assert_frame_equal(result, df) + result = concat([empty, df], axis=1) + assert_frame_equal(result, df) + + result = concat([df, empty]) + assert_frame_equal(result, df) + result = concat([empty, df]) + assert_frame_equal(result, df) + + def test_concat_mixed_objs(self): + + # concat mixed series/frames + # G2385 + + # axis 1 + index = date_range('01-Jan-2013', periods=10, freq='H') + arr = np.arange(10, dtype='int64') + s1 = Series(arr, index=index) + s2 = Series(arr, index=index) + df = DataFrame(arr.reshape(-1, 1), index=index) + + expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), + index=index, columns=[0, 0]) + result = concat([df, df], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), + index=index, columns=[0, 1]) + result = concat([s1, s2], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), + index=index, columns=[0, 1, 2]) + result = concat([s1, s2, s1], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5), + index=index, columns=[0, 0, 1, 2, 3]) + result = concat([s1, df, s2, s2, s1], axis=1) + assert_frame_equal(result, expected) + + # with names + s1.name = 'foo' + expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), + index=index, columns=['foo', 0, 0]) + result = concat([s1, df, s2], axis=1) + assert_frame_equal(result, expected) + + s2.name = 'bar' + expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), + index=index, columns=['foo', 0, 'bar']) + result = concat([s1, df, s2], axis=1) + assert_frame_equal(result, expected) + + # ignore index + expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), + index=index, columns=[0, 1, 2]) + result = concat([s1, df, s2], axis=1, ignore_index=True) + assert_frame_equal(result, expected) + + # axis 0 + expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), + 
index=index.tolist() * 3, columns=[0])
+        result = concat([s1, df, s2])
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
+        result = concat([s1, df, s2], ignore_index=True)
+        assert_frame_equal(result, expected)
+
+        # invalid concatenation of mixed dims
+        panel = tm.makePanel()
+        self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1))
+
+    def test_panel_join(self):
+        panel = tm.makePanel()
+        tm.add_nans(panel)
+
+        p1 = panel.ix[:2, :10, :3]
+        p2 = panel.ix[2:, 5:, 2:]
+
+        # left join
+        result = p1.join(p2)
+        expected = p1.copy()
+        expected['ItemC'] = p2['ItemC']
+        tm.assert_panel_equal(result, expected)
+
+        # right join
+        result = p1.join(p2, how='right')
+        expected = p2.copy()
+        expected['ItemA'] = p1['ItemA']
+        expected['ItemB'] = p1['ItemB']
+        expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
+        tm.assert_panel_equal(result, expected)
+
+        # inner join
+        result = p1.join(p2, how='inner')
+        expected = panel.ix[:, 5:10, 2:3]
+        tm.assert_panel_equal(result, expected)
+
+        # outer join
+        result = p1.join(p2, how='outer')
+        expected = p1.reindex(major=panel.major_axis,
+                              minor=panel.minor_axis)
+        expected = expected.join(p2.reindex(major=panel.major_axis,
+                                            minor=panel.minor_axis))
+        tm.assert_panel_equal(result, expected)
+
+    def test_panel_join_overlap(self):
+        panel = tm.makePanel()
+        tm.add_nans(panel)
+
+        p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']]
+        p2 = panel.ix[['ItemB', 'ItemC']]
+
+        # Expected index is
+        #
+        # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
+        joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
+        p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1')
+        p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2')
+        no_overlap = panel.ix[['ItemA']]
+        expected = no_overlap.join(p1_suf.join(p2_suf))
+        tm.assert_panel_equal(joined, expected)
+
+    def test_panel_join_many(self):
+        tm.K = 10
+        panel = tm.makePanel()
+        tm.K = 4
+
+        panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]]
+
+        joined = panels[0].join(panels[1:])
+        tm.assert_panel_equal(joined, panel)
+
+        panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]]
+
+        data_dict = {}
+        for p in panels:
+            data_dict.update(p.iteritems())
+
+        joined = panels[0].join(panels[1:], how='inner')
+        expected = Panel.from_dict(data_dict, intersect=True)
+        tm.assert_panel_equal(joined, expected)
+
+        joined = panels[0].join(panels[1:], how='outer')
+        expected = Panel.from_dict(data_dict, intersect=False)
+        tm.assert_panel_equal(joined, expected)
+
+        # edge cases
+        self.assertRaises(ValueError, panels[0].join, panels[1:],
+                          how='outer', lsuffix='foo', rsuffix='bar')
+        self.assertRaises(ValueError, panels[0].join, panels[1:],
+                          how='right')
+
+    def test_panel_concat_other_axes(self):
+        panel = tm.makePanel()
+
+        p1 = panel.ix[:, :5, :]
+        p2 = panel.ix[:, 5:, :]
+
+        result = concat([p1, p2], axis=1)
+        tm.assert_panel_equal(result, panel)
+
+        p1 = panel.ix[:, :, :2]
+        p2 = panel.ix[:, :, 2:]
+
+        result = concat([p1, p2], axis=2)
+        tm.assert_panel_equal(result, panel)
+
+        # if things are a bit misbehaved
+        p1 = panel.ix[:2, :, :2]
+        p2 = panel.ix[:, :, 2:]
+        p1['ItemC'] = 'baz'
+
+        result = concat([p1, p2], axis=2)
+
+        expected = panel.copy()
+        expected['ItemC'] = expected['ItemC'].astype('O')
+        expected.ix['ItemC', :, :2] = 'baz'
+        tm.assert_panel_equal(result, expected)
+
+    def test_panel_concat_buglet(self):
+        # #2257
+        def make_panel():
+            index = 5
+            cols = 3
+
+            def df():
+                return DataFrame(np.random.randn(index, cols),
+                                 index=["I%s" % i for i in 
range(index)], + columns=["C%s" % i for i in range(cols)]) + return Panel(dict([("Item%s" % x, df()) for x in ['A', 'B', 'C']])) + + panel1 = make_panel() + panel2 = make_panel() + + panel2 = panel2.rename_axis(dict([(x, "%s_1" % x) + for x in panel2.major_axis]), + axis=1) + + panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1) + panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) + + # it works! + concat([panel1, panel3], axis=1, verify_integrity=True) + + def test_panel4d_concat(self): + p4d = tm.makePanel4D() + + p1 = p4d.ix[:, :, :5, :] + p2 = p4d.ix[:, :, 5:, :] + + result = concat([p1, p2], axis=2) + tm.assert_panel4d_equal(result, p4d) + + p1 = p4d.ix[:, :, :, :2] + p2 = p4d.ix[:, :, :, 2:] + + result = concat([p1, p2], axis=3) + tm.assert_panel4d_equal(result, p4d) + + def test_panel4d_concat_mixed_type(self): + p4d = tm.makePanel4D() + + # if things are a bit misbehaved + p1 = p4d.ix[:, :2, :, :2] + p2 = p4d.ix[:, :, :, 2:] + p1['L5'] = 'baz' + + result = concat([p1, p2], axis=3) + + p2['L5'] = np.nan + expected = concat([p1, p2], axis=3) + expected = expected.ix[result.labels] + + tm.assert_panel4d_equal(result, expected) + + def test_concat_series(self): + + ts = tm.makeTimeSeries() + ts.name = 'foo' + + pieces = [ts[:5], ts[5:15], ts[15:]] + + result = concat(pieces) + tm.assert_series_equal(result, ts) + self.assertEqual(result.name, ts.name) + + result = concat(pieces, keys=[0, 1, 2]) + expected = ts.copy() + + ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) + + exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]), + np.arange(len(ts))] + exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], + labels=exp_labels) + expected.index = exp_index + tm.assert_series_equal(result, expected) + + def test_concat_series_axis1(self): + ts = tm.makeTimeSeries() + + pieces = [ts[:-2], ts[2:], ts[2:-2]] + + result = concat(pieces, axis=1) + expected = DataFrame(pieces).T + assert_frame_equal(result, expected) + + result = concat(pieces, keys=['A', 'B', 'C'], axis=1) + expected = DataFrame(pieces, index=['A', 'B', 'C']).T + assert_frame_equal(result, expected) + + # preserve series names, #2489 + s = Series(randn(5), name='A') + s2 = Series(randn(5), name='B') + + result = concat([s, s2], axis=1) + expected = DataFrame({'A': s, 'B': s2}) + assert_frame_equal(result, expected) + + s2.name = None + result = concat([s, s2], axis=1) + self.assertTrue(np.array_equal( + result.columns, Index(['A', 0], dtype='object'))) + + # must reindex, #2603 + s = Series(randn(3), index=['c', 'a', 'b'], name='A') + s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') + result = concat([s, s2], axis=1) + expected = DataFrame({'A': s, 'B': s2}) + assert_frame_equal(result, expected) + + def test_concat_single_with_key(self): + df = DataFrame(np.random.randn(10, 4)) + + result = concat([df], keys=['foo']) + expected = concat([df, df], keys=['foo', 'bar']) + tm.assert_frame_equal(result, expected[:10]) + + def test_concat_exclude_none(self): + df = DataFrame(np.random.randn(10, 4)) + + pieces = [df[:5], None, None, df[5:]] + result = concat(pieces) + tm.assert_frame_equal(result, df) + self.assertRaises(ValueError, concat, [None, None]) + + def test_concat_datetime64_block(self): + from pandas.tseries.index import date_range + + rng = date_range('1/1/2000', periods=10) + + df = DataFrame({'time': rng}) + + result = concat([df, df]) + self.assertTrue((result.iloc[:10]['time'] == rng).all()) + self.assertTrue((result.iloc[10:]['time'] == rng).all()) + + def 
test_concat_timedelta64_block(self):
+        from pandas import to_timedelta
+
+        rng = to_timedelta(np.arange(10), unit='s')
+
+        df = DataFrame({'time': rng})
+
+        result = concat([df, df])
+        self.assertTrue((result.iloc[:10]['time'] == rng).all())
+        self.assertTrue((result.iloc[10:]['time'] == rng).all())
+
+    def test_concat_keys_with_none(self):
+        # #1649
+        df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]])
+
+        result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0))
+        expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0))
+        tm.assert_frame_equal(result, expected)
+
+        result = concat([None, df0, df0[:2], df0[:1], df0],
+                        keys=['a', 'b', 'c', 'd', 'e'])
+        expected = concat([df0, df0[:2], df0[:1], df0],
+                          keys=['b', 'c', 'd', 'e'])
+        tm.assert_frame_equal(result, expected)
+
+    def test_concat_bug_1719(self):
+        ts1 = tm.makeTimeSeries()
+        ts2 = tm.makeTimeSeries()[::2]
+
+        # to join with union
+        # these two are of different length!
+        left = concat([ts1, ts2], join='outer', axis=1)
+        right = concat([ts2, ts1], join='outer', axis=1)
+
+        self.assertEqual(len(left), len(right))
+
+    def test_concat_bug_2972(self):
+        ts0 = Series(np.zeros(5))
+        ts1 = Series(np.ones(5))
+        ts0.name = ts1.name = 'same name'
+        result = concat([ts0, ts1], axis=1)
+
+        expected = DataFrame({0: ts0, 1: ts1})
+        expected.columns = ['same name', 'same name']
+        assert_frame_equal(result, expected)
+
+    def test_concat_bug_3602(self):
+
+        # GH 3602, duplicate columns
+        df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'stringvar': [
+            'rrr', 'rrr', 'rrr', 'rrr'], 'prc': [6, 6, 6, 6]})
+        df2 = DataFrame({'misc': [1, 2, 3, 4], 'prc': [
+            6, 6, 6, 6], 'C': [9, 10, 11, 12]})
+        expected = DataFrame([[0, 6, 'rrr', 9, 1, 6],
+                              [0, 6, 'rrr', 10, 2, 6],
+                              [0, 6, 'rrr', 11, 3, 6],
+                              [0, 6, 'rrr', 12, 4, 6]])
+        expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc']
+
+        result = concat([df1, df2], axis=1)
+        assert_frame_equal(result, expected)
+
+    def test_concat_series_axis1_same_names_ignore_index(self):
+        dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1]
+        s1 = Series(randn(len(dates)), index=dates, name='value')
+        s2 = Series(randn(len(dates)), index=dates, name='value')
+
+        result = concat([s1, s2], axis=1, ignore_index=True)
+        self.assertTrue(np.array_equal(result.columns, [0, 1]))
+
+    def test_concat_iterables(self):
+        from collections import deque, Iterable
+
+        # GH8645 check concat works with tuples, list, generators, and weird
+        # stuff like deque and custom iterables
+        df1 = DataFrame([1, 2, 3])
+        df2 = DataFrame([4, 5, 6])
+        expected = DataFrame([1, 2, 3, 4, 5, 6])
+        assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
+        assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
+        assert_frame_equal(concat((df for df in (df1, df2)),
+                                  ignore_index=True), expected)
+        assert_frame_equal(
+            concat(deque((df1, df2)), ignore_index=True), expected)
+
+        class CustomIterator1(object):
+
+            def __len__(self):
+                return 2
+
+            def __getitem__(self, index):
+                try:
+                    return {0: df1, 1: df2}[index]
+                except KeyError:
+                    raise IndexError
+        assert_frame_equal(pd.concat(CustomIterator1(),
+                                     ignore_index=True), expected)
+
+        class CustomIterator2(Iterable):
+
+            def __iter__(self):
+                yield df1
+                yield df2
+        assert_frame_equal(pd.concat(CustomIterator2(),
+                                     ignore_index=True), expected)
+
+    def test_concat_invalid(self):
+
+        # trying to concat an ndframe with a non-ndframe
+        df1 = mkdf(10, 2)
+        for obj in [1, dict(), [1, 2], (1, 2)]:
+            self.assertRaises(TypeError, lambda: concat([df1, 
obj])) + + def test_concat_invalid_first_argument(self): + df1 = mkdf(10, 2) + df2 = mkdf(10, 2) + self.assertRaises(TypeError, concat, df1, df2) + + # generator ok though + concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) + + # text reader ok + # GH6583 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + reader = read_csv(StringIO(data), chunksize=1) + result = concat(reader, ignore_index=True) + expected = read_csv(StringIO(data)) + assert_frame_equal(result, expected) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 13f00afb5a489..2505309768997 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -9,20 +9,17 @@ import random import pandas as pd -from pandas.compat import range, lrange, lzip, StringIO -from pandas import compat -from pandas.tseries.index import DatetimeIndex -from pandas.tools.merge import merge, concat, ordered_merge, MergeError -from pandas import Categorical, Timestamp -from pandas.util.testing import (assert_frame_equal, assert_series_equal, - assert_almost_equal, - makeCustomDataframe as mkdf, - assertRaisesRegexp) -from pandas import (isnull, DataFrame, Index, MultiIndex, Panel, - Series, date_range, read_csv) +from pandas.compat import range, lrange, lzip +from pandas.tools.merge import merge, concat, MergeError +from pandas.util.testing import (assert_frame_equal, + assert_series_equal, + slow) +from pandas import (DataFrame, Index, MultiIndex, + Series, date_range, Categorical, + compat) import pandas.algos as algos import pandas.util.testing as tm -from numpy.testing.decorators import slow + a_ = np.array @@ -203,8 +200,10 @@ def test_join_on(self): source = self.source merged = target.join(source, on='C') - self.assert_numpy_array_equal(merged['MergedA'], target['A']) - self.assert_numpy_array_equal(merged['MergedD'], target['D']) + self.assert_series_equal(merged['MergedA'], target['A'], + check_names=False) + self.assert_series_equal(merged['MergedD'], target['D'], + check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) @@ -289,7 +288,7 @@ def test_join_with_len0(self): merged2 = self.target.join(self.source.reindex([]), on='C', how='inner') - self.assertTrue(merged2.columns.equals(merged.columns)) + self.assert_index_equal(merged2.columns, merged.columns) self.assertEqual(len(merged2), 0) def test_join_on_inner(self): @@ -300,9 +299,11 @@ def test_join_on_inner(self): expected = df.join(df2, on='key') expected = expected[expected['value'].notnull()] - self.assert_numpy_array_equal(joined['key'], expected['key']) - self.assert_numpy_array_equal(joined['value'], expected['value']) - self.assertTrue(joined.index.equals(expected.index)) + self.assert_series_equal(joined['key'], expected['key'], + check_dtype=False) + self.assert_series_equal(joined['value'], expected['value'], + check_dtype=False) + self.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) @@ -509,11 +510,10 @@ def test_join_many_non_unique_index(self): expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') result = result.reset_index() - - result['a'] = result['a'].astype(np.float64) - result['b'] = result['b'].astype(np.float64) - - 
assert_frame_equal(result, expected.ix[:, result.columns]) + expected = expected[result.columns] + expected['a'] = expected.a.astype('int64') + expected['b'] = expected.b.astype('int64') + assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) @@ -666,7 +666,7 @@ def test_join_sort(self): # smoke test joined = left.join(right, on='key', sort=False) - self.assert_numpy_array_equal(joined.index, lrange(4)) + self.assert_index_equal(joined.index, pd.Index(lrange(4))) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -677,14 +677,35 @@ def test_intelligently_handle_join_key(self): 'rvalue': lrange(6)}) joined = merge(left, right, on='key', how='outer') - expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.], + expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5], 'value': np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]), - 'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])}, + 'rvalue': [0, 1, 0, 1, 2, 2, 3, 4, 5]}, columns=['value', 'key', 'rvalue']) - assert_frame_equal(joined, expected, check_dtype=False) + assert_frame_equal(joined, expected) + + def test_merge_join_key_dtype_cast(self): + # #8596 - self.assertTrue(joined._data.is_consolidated()) + df1 = DataFrame({'key': [1], 'v1': [10]}) + df2 = DataFrame({'key': [2], 'v1': [20]}) + df = merge(df1, df2, how='outer') + self.assertEqual(df['key'].dtype, 'int64') + + df1 = DataFrame({'key': [True], 'v1': [1]}) + df2 = DataFrame({'key': [False], 'v1': [0]}) + df = merge(df1, df2, how='outer') + + # GH13169 + # this really should be bool + self.assertEqual(df['key'].dtype, 'object') + + df1 = DataFrame({'val': [1]}) + df2 = DataFrame({'val': [2]}) + lkey = np.array([1]) + rkey = np.array([2]) + df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer') + self.assertEqual(df['key_0'].dtype, 'int64') def test_handle_join_key_pass_array(self): left = DataFrame({'key': [1, 1, 2, 2, 3], @@ -705,15 +726,16 @@ def test_handle_join_key_pass_array(self): rkey = np.array([1, 1, 2, 3, 4, 5]) merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer') - self.assert_numpy_array_equal(merged['key_0'], - np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])) + self.assert_series_equal(merged['key_0'], + Series([1, 1, 1, 1, 2, 2, 3, 4, 5], + name='key_0')) left = DataFrame({'value': lrange(3)}) right = DataFrame({'rvalue': lrange(6)}) - key = np.array([0, 1, 1, 2, 2, 3]) + key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64) merged = merge(left, right, left_index=True, right_on=key, how='outer') - self.assert_numpy_array_equal(merged['key_0'], key) + self.assert_series_equal(merged['key_0'], Series(key, name='key_0')) def test_mixed_type_join_with_suffix(self): # GH #916 @@ -817,20 +839,32 @@ def test_merge_left_empty_right_notempty(self): # result will have object dtype exp_in.index = exp_in.index.astype(object) - for kwarg in [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x'), - dict(left_on='a', right_index=True), - dict(left_on='a', right_on='x')]: - + def check1(exp, kwarg): result = pd.merge(left, right, how='inner', **kwarg) - tm.assert_frame_equal(result, exp_in) + tm.assert_frame_equal(result, exp) result = pd.merge(left, right, how='left', **kwarg) - tm.assert_frame_equal(result, exp_in) + tm.assert_frame_equal(result, exp) + def check2(exp, kwarg): result = pd.merge(left, right, how='right', **kwarg) - 
tm.assert_frame_equal(result, exp_out) + tm.assert_frame_equal(result, exp) result = pd.merge(left, right, how='outer', **kwarg) - tm.assert_frame_equal(result, exp_out) + tm.assert_frame_equal(result, exp) + + for kwarg in [dict(left_index=True, right_index=True), + dict(left_index=True, right_on='x')]: + check1(exp_in, kwarg) + check2(exp_out, kwarg) + + kwarg = dict(left_on='a', right_index=True) + check1(exp_in, kwarg) + exp_out['a'] = [0, 1, 2] + check2(exp_out, kwarg) + + kwarg = dict(left_on='a', right_on='x') + check1(exp_in, kwarg) + exp_out['a'] = np.array([np.nan] * 3, dtype=object) + check2(exp_out, kwarg) def test_merge_left_notempty_right_empty(self): # GH 10824 @@ -849,20 +883,24 @@ def test_merge_left_notempty_right_empty(self): # result will have object dtype exp_in.index = exp_in.index.astype(object) - for kwarg in [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x'), - dict(left_on='a', right_index=True), - dict(left_on='a', right_on='x')]: - + def check1(exp, kwarg): result = pd.merge(left, right, how='inner', **kwarg) - tm.assert_frame_equal(result, exp_in) + tm.assert_frame_equal(result, exp) result = pd.merge(left, right, how='right', **kwarg) - tm.assert_frame_equal(result, exp_in) + tm.assert_frame_equal(result, exp) + def check2(exp, kwarg): result = pd.merge(left, right, how='left', **kwarg) - tm.assert_frame_equal(result, exp_out) + tm.assert_frame_equal(result, exp) result = pd.merge(left, right, how='outer', **kwarg) - tm.assert_frame_equal(result, exp_out) + tm.assert_frame_equal(result, exp) + + for kwarg in [dict(left_index=True, right_index=True), + dict(left_index=True, right_on='x'), + dict(left_on='a', right_index=True), + dict(left_on='a', right_on='x')]: + check1(exp_in, kwarg) + check2(exp_out, kwarg) def test_merge_nosort(self): # #2098, anything to do? 
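The dtype behavior that test_merge_join_key_dtype_cast pins down above is worth seeing end to end; a minimal sketch mirroring that test (frames are illustrative):

    import pandas as pd

    df1 = pd.DataFrame({'key': [1], 'v1': [10]})
    df2 = pd.DataFrame({'key': [2], 'v1': [20]})
    merged = pd.merge(df1, df2, how='outer')

    # with the key-coalescing change, the outer-merged key column keeps its
    # integer dtype instead of being upcast to float64 by NaN filling
    assert merged['key'].dtype == 'int64'
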
@@ -1064,7 +1102,7 @@ def test_merge_on_datetime64tz(self): tz='US/Eastern')) + [pd.NaT], 'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2, tz='US/Eastern')), - 'key': [1., 2, 3]}) + 'key': [1, 2, 3]}) result = pd.merge(left, right, on='key', how='outer') assert_frame_equal(result, expected) self.assertEqual(result['value_x'].dtype, 'datetime64[ns, US/Eastern]') @@ -1096,7 +1134,7 @@ def test_merge_on_periods(self): exp_y = pd.period_range('20151011', periods=2, freq='D') expected = DataFrame({'value_x': list(exp_x) + [pd.NaT], 'value_y': [pd.NaT] + list(exp_y), - 'key': [1., 2, 3]}) + 'key': [1, 2, 3]}) result = pd.merge(left, right, on='key', how='outer') assert_frame_equal(result, expected) self.assertEqual(result['value_x'].dtype, 'object') @@ -1136,6 +1174,15 @@ def test_concat_NaT_series(self): result = pd.concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) + def test_concat_tz_frame(self): + df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), + B=pd.Timestamp('20130603', tz='CET')), + index=range(5)) + + # concat + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + assert_frame_equal(df2, df3) + def test_concat_tz_series(self): # GH 11755 # tz and no tz @@ -1329,7 +1376,7 @@ def test_indicator(self): 'col_conflict_x': [1, 2, np.nan, np.nan, np.nan, np.nan], 'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan], 'col_conflict_y': [np.nan, 1, 2, 3, 4, 5], - 'col_right': [np.nan, 2, 2, 2, 2, 2]}, dtype='float64') + 'col_right': [np.nan, 2, 2, 2, 2, 2]}) df_result['_merge'] = Categorical( ['left_only', 'both', 'right_only', 'right_only', 'right_only', 'right_only'], @@ -1408,7 +1455,7 @@ def test_indicator(self): df4 = DataFrame({'col1': [1, 1, 3], 'col2': ['b', 'x', 'y']}) - hand_coded_result = DataFrame({'col1': [0, 1, 1, 3.0], + hand_coded_result = DataFrame({'col1': [0, 1, 1, 3], 'col2': ['a', 'b', 'x', 'y']}) hand_coded_result['_merge'] = Categorical( ['left_only', 'both', 'right_only', 'right_only'], @@ -2159,1100 +2206,6 @@ def _join_by_hand(a, b, how='left'): return a_re.reindex(columns=result_columns) -class TestConcatenate(tm.TestCase): - - _multiprocess_can_split_ = True - - def setUp(self): - self.frame = DataFrame(tm.getSeriesData()) - self.mixed_frame = self.frame.copy() - self.mixed_frame['foo'] = 'bar' - - def test_append(self): - begin_index = self.frame.index[:5] - end_index = self.frame.index[5:] - - begin_frame = self.frame.reindex(begin_index) - end_frame = self.frame.reindex(end_index) - - appended = begin_frame.append(end_frame) - assert_almost_equal(appended['A'], self.frame['A']) - - del end_frame['A'] - partial_appended = begin_frame.append(end_frame) - self.assertIn('A', partial_appended) - - partial_appended = end_frame.append(begin_frame) - self.assertIn('A', partial_appended) - - # mixed type handling - appended = self.mixed_frame[:5].append(self.mixed_frame[5:]) - assert_frame_equal(appended, self.mixed_frame) - - # what to test here - mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) - mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) - - # all equal except 'foo' column - assert_frame_equal( - mixed_appended.reindex(columns=['A', 'B', 'C', 'D']), - mixed_appended2.reindex(columns=['A', 'B', 'C', 'D'])) - - # append empty - empty = DataFrame({}) - - appended = self.frame.append(empty) - assert_frame_equal(self.frame, appended) - self.assertIsNot(appended, self.frame) - - appended = empty.append(self.frame) - assert_frame_equal(self.frame, appended) - 
self.assertIsNot(appended, self.frame) - - # overlap - self.assertRaises(ValueError, self.frame.append, self.frame, - verify_integrity=True) - - # new columns - # GH 6129 - df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}}) - row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z') - expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': { - 'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}}) - result = df.append(row) - assert_frame_equal(result, expected) - - def test_append_length0_frame(self): - df = DataFrame(columns=['A', 'B', 'C']) - df3 = DataFrame(index=[0, 1], columns=['A', 'B']) - df5 = df.append(df3) - - expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) - assert_frame_equal(df5, expected) - - def test_append_records(self): - arr1 = np.zeros((2,), dtype=('i4,f4,a10')) - arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")] - - arr2 = np.zeros((3,), dtype=('i4,f4,a10')) - arr2[:] = [(3, 4., 'foo'), - (5, 6., "bar"), - (7., 8., 'baz')] - - df1 = DataFrame(arr1) - df2 = DataFrame(arr2) - - result = df1.append(df2, ignore_index=True) - expected = DataFrame(np.concatenate((arr1, arr2))) - assert_frame_equal(result, expected) - - def test_append_different_columns(self): - df = DataFrame({'bools': np.random.randn(10) > 0, - 'ints': np.random.randint(0, 10, 10), - 'floats': np.random.randn(10), - 'strings': ['foo', 'bar'] * 5}) - - a = df[:5].ix[:, ['bools', 'ints', 'floats']] - b = df[5:].ix[:, ['strings', 'ints', 'floats']] - - appended = a.append(b) - self.assertTrue(isnull(appended['strings'][0:4]).all()) - self.assertTrue(isnull(appended['bools'][5:]).all()) - - def test_append_many(self): - chunks = [self.frame[:5], self.frame[5:10], - self.frame[10:15], self.frame[15:]] - - result = chunks[0].append(chunks[1:]) - tm.assert_frame_equal(result, self.frame) - - chunks[-1] = chunks[-1].copy() - chunks[-1]['foo'] = 'bar' - result = chunks[0].append(chunks[1:]) - tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame) - self.assertTrue((result['foo'][15:] == 'bar').all()) - self.assertTrue(result['foo'][:15].isnull().all()) - - def test_append_preserve_index_name(self): - # #980 - df1 = DataFrame(data=None, columns=['A', 'B', 'C']) - df1 = df1.set_index(['A']) - df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], - columns=['A', 'B', 'C']) - df2 = df2.set_index(['A']) - - result = df1.append(df2) - self.assertEqual(result.index.name, 'A') - - def test_join_many(self): - df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) - df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] - - joined = df_list[0].join(df_list[1:]) - tm.assert_frame_equal(joined, df) - - df_list = [df[['a', 'b']][:-2], - df[['c', 'd']][2:], df[['e', 'f']][1:9]] - - def _check_diff_index(df_list, result, exp_index): - reindexed = [x.reindex(exp_index) for x in df_list] - expected = reindexed[0].join(reindexed[1:]) - tm.assert_frame_equal(result, expected) - - # different join types - joined = df_list[0].join(df_list[1:], how='outer') - _check_diff_index(df_list, joined, df.index) - - joined = df_list[0].join(df_list[1:]) - _check_diff_index(df_list, joined, df_list[0].index) - - joined = df_list[0].join(df_list[1:], how='inner') - _check_diff_index(df_list, joined, df.index[2:8]) - - self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') - - def test_join_many_mixed(self): - df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) - df['key'] = ['foo', 'bar'] * 4 - df1 = df.ix[:, ['A', 'B']] - df2 = df.ix[:, ['C', 'D']] - df3 = df.ix[:, ['key']] - - result 
= df1.join([df2, df3]) - assert_frame_equal(result, df) - - def test_append_missing_column_proper_upcast(self): - df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) - df2 = DataFrame({'B': np.array([True, False, True, False], - dtype=bool)}) - - appended = df1.append(df2, ignore_index=True) - self.assertEqual(appended['A'].dtype, 'f8') - self.assertEqual(appended['B'].dtype, 'O') - - def test_concat_copy(self): - - df = DataFrame(np.random.randn(4, 3)) - df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) - df3 = DataFrame({5: 'foo'}, index=range(4)) - - # these are actual copies - result = concat([df, df2, df3], axis=1, copy=True) - for b in result._data.blocks: - self.assertIsNone(b.values.base) - - # these are the same - result = concat([df, df2, df3], axis=1, copy=False) - for b in result._data.blocks: - if b.is_float: - self.assertTrue( - b.values.base is df._data.blocks[0].values.base) - elif b.is_integer: - self.assertTrue( - b.values.base is df2._data.blocks[0].values.base) - elif b.is_object: - self.assertIsNotNone(b.values.base) - - # float block was consolidated - df4 = DataFrame(np.random.randn(4, 1)) - result = concat([df, df2, df3, df4], axis=1, copy=False) - for b in result._data.blocks: - if b.is_float: - self.assertIsNone(b.values.base) - elif b.is_integer: - self.assertTrue( - b.values.base is df2._data.blocks[0].values.base) - elif b.is_object: - self.assertIsNotNone(b.values.base) - - def test_concat_with_group_keys(self): - df = DataFrame(np.random.randn(4, 3)) - df2 = DataFrame(np.random.randn(4, 4)) - - # axis=0 - df = DataFrame(np.random.randn(3, 4)) - df2 = DataFrame(np.random.randn(4, 4)) - - result = concat([df, df2], keys=[0, 1]) - exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1], - [0, 1, 2, 0, 1, 2, 3]]) - expected = DataFrame(np.r_[df.values, df2.values], - index=exp_index) - tm.assert_frame_equal(result, expected) - - result = concat([df, df], keys=[0, 1]) - exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 1, 2]]) - expected = DataFrame(np.r_[df.values, df.values], - index=exp_index2) - tm.assert_frame_equal(result, expected) - - # axis=1 - df = DataFrame(np.random.randn(4, 3)) - df2 = DataFrame(np.random.randn(4, 4)) - - result = concat([df, df2], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df2.values], - columns=exp_index) - tm.assert_frame_equal(result, expected) - - result = concat([df, df], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df.values], - columns=exp_index2) - tm.assert_frame_equal(result, expected) - - def test_concat_keys_specific_levels(self): - df = DataFrame(np.random.randn(10, 4)) - pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] - level = ['three', 'two', 'one', 'zero'] - result = concat(pieces, axis=1, keys=['one', 'two', 'three'], - levels=[level], - names=['group_key']) - - self.assert_numpy_array_equal(result.columns.levels[0], level) - self.assertEqual(result.columns.names[0], 'group_key') - - def test_concat_dataframe_keys_bug(self): - t1 = DataFrame({ - 'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'], - name='id'))}) - t2 = DataFrame({ - 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) - - # it works - result = concat([t1, t2], axis=1, keys=['t1', 't2']) - self.assertEqual(list(result.columns), [('t1', 'value'), - ('t2', 'value')]) - - def test_concat_series_partial_columns_names(self): - # GH10698 - foo = Series([1, 2], name='foo') - bar = Series([1, 2]) - baz = Series([4, 5]) - - result = concat([foo, bar, baz], 
axis=1) - expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [ - 4, 5]}, columns=['foo', 0, 1]) - tm.assert_frame_equal(result, expected) - - result = concat([foo, bar, baz], axis=1, keys=[ - 'red', 'blue', 'yellow']) - expected = DataFrame({'red': [1, 2], 'blue': [1, 2], 'yellow': [ - 4, 5]}, columns=['red', 'blue', 'yellow']) - tm.assert_frame_equal(result, expected) - - result = concat([foo, bar, baz], axis=1, ignore_index=True) - expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) - tm.assert_frame_equal(result, expected) - - def test_concat_dict(self): - frames = {'foo': DataFrame(np.random.randn(4, 3)), - 'bar': DataFrame(np.random.randn(4, 3)), - 'baz': DataFrame(np.random.randn(4, 3)), - 'qux': DataFrame(np.random.randn(4, 3))} - - sorted_keys = sorted(frames) - - result = concat(frames) - expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) - tm.assert_frame_equal(result, expected) - - result = concat(frames, axis=1) - expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, - axis=1) - tm.assert_frame_equal(result, expected) - - keys = ['baz', 'foo', 'bar'] - result = concat(frames, keys=keys) - expected = concat([frames[k] for k in keys], keys=keys) - tm.assert_frame_equal(result, expected) - - def test_concat_ignore_index(self): - frame1 = DataFrame({"test1": ["a", "b", "c"], - "test2": [1, 2, 3], - "test3": [4.5, 3.2, 1.2]}) - frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) - frame1.index = Index(["x", "y", "z"]) - frame2.index = Index(["x", "y", "q"]) - - v1 = concat([frame1, frame2], axis=1, ignore_index=True) - - nan = np.nan - expected = DataFrame([[nan, nan, nan, 4.3], - ['a', 1, 4.5, 5.2], - ['b', 2, 3.2, 2.2], - ['c', 3, 1.2, nan]], - index=Index(["q", "x", "y", "z"])) - - tm.assert_frame_equal(v1, expected) - - def test_concat_multiindex_with_keys(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - frame = DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) - result = concat([frame, frame], keys=[0, 1], names=['iteration']) - - self.assertEqual(result.index.names, ('iteration',) + index.names) - tm.assert_frame_equal(result.ix[0], frame) - tm.assert_frame_equal(result.ix[1], frame) - self.assertEqual(result.index.nlevels, 3) - - def test_concat_multiindex_with_tz(self): - # GH 6606 - df = DataFrame({'dt': [datetime(2014, 1, 1), - datetime(2014, 1, 2), - datetime(2014, 1, 3)], - 'b': ['A', 'B', 'C'], - 'c': [1, 2, 3], 'd': [4, 5, 6]}) - df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific')) - df = df.set_index(['dt', 'b']) - - exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02', - '2014-01-03'] * 2, - tz='US/Pacific', name='dt') - exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b') - exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2}, - index=exp_idx, columns=['c', 'd']) - - result = concat([df, df]) - tm.assert_frame_equal(result, expected) - - def test_concat_keys_and_levels(self): - df = DataFrame(np.random.randn(1, 3)) - df2 = DataFrame(np.random.randn(1, 4)) - - levels = [['foo', 'baz'], ['one', 'two']] - names = ['first', 'second'] - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - levels=levels, - names=names) - expected = concat([df, df2, df, df2]) - exp_index = MultiIndex(levels=levels + [[0]], - 
labels=[[0, 0, 1, 1], [0, 1, 0, 1], - [0, 0, 0, 0]], - names=names + [None]) - expected.index = exp_index - - assert_frame_equal(result, expected) - - # no names - - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - levels=levels) - self.assertEqual(result.index.names, (None,) * 3) - - # no levels - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - names=['first', 'second']) - self.assertEqual(result.index.names, ('first', 'second') + (None,)) - self.assert_numpy_array_equal(result.index.levels[0], ['baz', 'foo']) - - def test_concat_keys_levels_no_overlap(self): - # GH #1406 - df = DataFrame(np.random.randn(1, 3), index=['a']) - df2 = DataFrame(np.random.randn(1, 4), index=['b']) - - self.assertRaises(ValueError, concat, [df, df], - keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) - - self.assertRaises(ValueError, concat, [df, df2], - keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) - - def test_concat_rename_index(self): - a = DataFrame(np.random.rand(3, 3), - columns=list('ABC'), - index=Index(list('abc'), name='index_a')) - b = DataFrame(np.random.rand(3, 3), - columns=list('ABC'), - index=Index(list('abc'), name='index_b')) - - result = concat([a, b], keys=['key0', 'key1'], - names=['lvl0', 'lvl1']) - - exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0']) - names = list(exp.index.names) - names[1] = 'lvl1' - exp.index.set_names(names, inplace=True) - - tm.assert_frame_equal(result, exp) - self.assertEqual(result.index.names, exp.index.names) - - def test_crossed_dtypes_weird_corner(self): - columns = ['A', 'B', 'C', 'D'] - df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'), - 'B': np.array([1, 2, 3, 4], dtype='i8'), - 'C': np.array([1, 2, 3, 4], dtype='f8'), - 'D': np.array([1, 2, 3, 4], dtype='i8')}, - columns=columns) - - df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'), - 'B': np.array([1, 2, 3, 4], dtype='f8'), - 'C': np.array([1, 2, 3, 4], dtype='i8'), - 'D': np.array([1, 2, 3, 4], dtype='f8')}, - columns=columns) - - appended = df1.append(df2, ignore_index=True) - expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0), - columns=columns) - tm.assert_frame_equal(appended, expected) - - df = DataFrame(np.random.randn(1, 3), index=['a']) - df2 = DataFrame(np.random.randn(1, 4), index=['b']) - result = concat( - [df, df2], keys=['one', 'two'], names=['first', 'second']) - self.assertEqual(result.index.names, ('first', 'second')) - - def test_dups_index(self): - # GH 4771 - - # single dtypes - df = DataFrame(np.random.randint(0, 10, size=40).reshape( - 10, 4), columns=['A', 'A', 'C', 'C']) - - result = concat([df, df], axis=1) - assert_frame_equal(result.iloc[:, :4], df) - assert_frame_equal(result.iloc[:, 4:], df) - - result = concat([df, df], axis=0) - assert_frame_equal(result.iloc[:10], df) - assert_frame_equal(result.iloc[10:], df) - - # multi dtypes - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) - - result = concat([df, df], axis=1) - assert_frame_equal(result.iloc[:, :6], df) - assert_frame_equal(result.iloc[:, 6:], df) - - result = concat([df, df], axis=0) - assert_frame_equal(result.iloc[:10], df) - assert_frame_equal(result.iloc[10:], df) - - # append - result = df.iloc[0:8, :].append(df.iloc[8:]) - assert_frame_equal(result, df) - - result = df.iloc[0:8, 
:].append(df.iloc[8:9]).append(df.iloc[9:10]) - assert_frame_equal(result, df) - - expected = concat([df, df], axis=0) - result = df.append(df) - assert_frame_equal(result, expected) - - def test_with_mixed_tuples(self): - # 10697 - # columns have mixed tuples, so handle properly - df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2)) - df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2)) - - # it works - concat([df1, df2]) - - def test_join_dups(self): - - # joining dups - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) - - expected = concat([df, df], axis=1) - result = df.join(df, rsuffix='_2') - result.columns = expected.columns - assert_frame_equal(result, expected) - - # GH 4975, invalid join on dups - w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - - dta = x.merge(y, left_index=True, right_index=True).merge( - z, left_index=True, right_index=True, how="outer") - dta = dta.merge(w, left_index=True, right_index=True) - expected = concat([x, y, z, w], axis=1) - expected.columns = ['x_x', 'y_x', 'x_y', - 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] - assert_frame_equal(dta, expected) - - def test_handle_empty_objects(self): - df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) - - baz = df[:5].copy() - baz['foo'] = 'bar' - empty = df[5:5] - - frames = [baz, empty, empty, df[5:]] - concatted = concat(frames, axis=0) - - expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']] - expected['foo'] = expected['foo'].astype('O') - expected.loc[0:4, 'foo'] = 'bar' - - tm.assert_frame_equal(concatted, expected) - - # empty as first element with time series - # GH3259 - df = DataFrame(dict(A=range(10000)), index=date_range( - '20130101', periods=10000, freq='s')) - empty = DataFrame() - result = concat([df, empty], axis=1) - assert_frame_equal(result, df) - result = concat([empty, df], axis=1) - assert_frame_equal(result, df) - - result = concat([df, empty]) - assert_frame_equal(result, df) - result = concat([empty, df]) - assert_frame_equal(result, df) - - def test_concat_mixed_objs(self): - - # concat mixed series/frames - # G2385 - - # axis 1 - index = date_range('01-Jan-2013', periods=10, freq='H') - arr = np.arange(10, dtype='int64') - s1 = Series(arr, index=index) - s2 = Series(arr, index=index) - df = DataFrame(arr.reshape(-1, 1), index=index) - - expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), - index=index, columns=[0, 0]) - result = concat([df, df], axis=1) - assert_frame_equal(result, expected) - - expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), - index=index, columns=[0, 1]) - result = concat([s1, s2], axis=1) - assert_frame_equal(result, expected) - - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=[0, 1, 2]) - result = concat([s1, s2, s1], axis=1) - assert_frame_equal(result, expected) - - expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5), - index=index, columns=[0, 0, 1, 2, 3]) - result = concat([s1, df, s2, s2, s1], axis=1) - assert_frame_equal(result, expected) - - # with names - s1.name = 'foo' - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=['foo', 0, 0]) - result = concat([s1, df, s2], axis=1) - assert_frame_equal(result, expected) - - s2.name = 'bar' - expected = 
DataFrame(np.repeat(arr, 3).reshape(-1, 3),
-                             index=index, columns=['foo', 0, 'bar'])
-        result = concat([s1, df, s2], axis=1)
-        assert_frame_equal(result, expected)
-
-        # ignore index
-        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
-                             index=index, columns=[0, 1, 2])
-        result = concat([s1, df, s2], axis=1, ignore_index=True)
-        assert_frame_equal(result, expected)
-
-        # axis 0
-        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1),
-                             index=index.tolist() * 3, columns=[0])
-        result = concat([s1, df, s2])
-        assert_frame_equal(result, expected)
-
-        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
-        result = concat([s1, df, s2], ignore_index=True)
-        assert_frame_equal(result, expected)
-
-        # invalid concatente of mixed dims
-        panel = tm.makePanel()
-        self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1))
-
-    def test_panel_join(self):
-        panel = tm.makePanel()
-        tm.add_nans(panel)
-
-        p1 = panel.ix[:2, :10, :3]
-        p2 = panel.ix[2:, 5:, 2:]
-
-        # left join
-        result = p1.join(p2)
-        expected = p1.copy()
-        expected['ItemC'] = p2['ItemC']
-        tm.assert_panel_equal(result, expected)
-
-        # right join
-        result = p1.join(p2, how='right')
-        expected = p2.copy()
-        expected['ItemA'] = p1['ItemA']
-        expected['ItemB'] = p1['ItemB']
-        expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
-        tm.assert_panel_equal(result, expected)
-
-        # inner join
-        result = p1.join(p2, how='inner')
-        expected = panel.ix[:, 5:10, 2:3]
-        tm.assert_panel_equal(result, expected)
-
-        # outer join
-        result = p1.join(p2, how='outer')
-        expected = p1.reindex(major=panel.major_axis,
-                              minor=panel.minor_axis)
-        expected = expected.join(p2.reindex(major=panel.major_axis,
-                                            minor=panel.minor_axis))
-        tm.assert_panel_equal(result, expected)
-
-    def test_panel_join_overlap(self):
-        panel = tm.makePanel()
-        tm.add_nans(panel)
-
-        p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']]
-        p2 = panel.ix[['ItemB', 'ItemC']]
-
-        # Expected index is
-        #
-        # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
-        joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
-        p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1')
-        p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2')
-        no_overlap = panel.ix[['ItemA']]
-        expected = no_overlap.join(p1_suf.join(p2_suf))
-        tm.assert_panel_equal(joined, expected)
-
-    def test_panel_join_many(self):
-        tm.K = 10
-        panel = tm.makePanel()
-        tm.K = 4
-
-        panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]]
-
-        joined = panels[0].join(panels[1:])
-        tm.assert_panel_equal(joined, panel)
-
-        panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]]
-
-        data_dict = {}
-        for p in panels:
-            data_dict.update(p.iteritems())
-
-        joined = panels[0].join(panels[1:], how='inner')
-        expected = Panel.from_dict(data_dict, intersect=True)
-        tm.assert_panel_equal(joined, expected)
-
-        joined = panels[0].join(panels[1:], how='outer')
-        expected = Panel.from_dict(data_dict, intersect=False)
-        tm.assert_panel_equal(joined, expected)
-
-        # edge cases
-        self.assertRaises(ValueError, panels[0].join, panels[1:],
-                          how='outer', lsuffix='foo', rsuffix='bar')
-        self.assertRaises(ValueError, panels[0].join, panels[1:],
-                          how='right')
-
-    def test_panel_concat_other_axes(self):
-        panel = tm.makePanel()
-
-        p1 = panel.ix[:, :5, :]
-        p2 = panel.ix[:, 5:, :]
-
-        result = concat([p1, p2], axis=1)
-        tm.assert_panel_equal(result, panel)
-
-        p1 = panel.ix[:, :, :2]
-        p2 = panel.ix[:, :, 2:]
-
-        result = concat([p1, p2], axis=2)
-        tm.assert_panel_equal(result, panel)
-
-        # if things are a bit misbehaved
-        p1 = panel.ix[:2, :, :2]
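Note: the test_concat_mixed_objs block deleted above pins down how concat chooses column labels when Series and DataFrames are mixed along axis=1, and how ignore_index discards them. A minimal sketch of the asserted behaviour, with illustrative toy data that is not part of the patch:

import numpy as np
import pandas as pd

idx = pd.date_range('2013-01-01', periods=3, freq='H')
arr = np.arange(3, dtype='int64')
s = pd.Series(arr, index=idx, name='foo')          # named Series -> column 'foo'
df = pd.DataFrame(arr.reshape(-1, 1), index=idx)   # unnamed frame column -> 0

wide = pd.concat([s, df], axis=1)
assert list(wide.columns) == ['foo', 0]            # labels kept side by side

wide = pd.concat([s, df], axis=1, ignore_index=True)
assert list(wide.columns) == [0, 1]                # positional relabelling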
- p2 = panel.ix[:, :, 2:] - p1['ItemC'] = 'baz' - - result = concat([p1, p2], axis=2) - - expected = panel.copy() - expected['ItemC'] = expected['ItemC'].astype('O') - expected.ix['ItemC', :, :2] = 'baz' - tm.assert_panel_equal(result, expected) - - def test_panel_concat_buglet(self): - # #2257 - def make_panel(): - index = 5 - cols = 3 - - def df(): - return DataFrame(np.random.randn(index, cols), - index=["I%s" % i for i in range(index)], - columns=["C%s" % i for i in range(cols)]) - return Panel(dict([("Item%s" % x, df()) for x in ['A', 'B', 'C']])) - - panel1 = make_panel() - panel2 = make_panel() - - panel2 = panel2.rename_axis(dict([(x, "%s_1" % x) - for x in panel2.major_axis]), - axis=1) - - panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1) - panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) - - # it works! - concat([panel1, panel3], axis=1, verify_integrity=True) - - def test_panel4d_concat(self): - p4d = tm.makePanel4D() - - p1 = p4d.ix[:, :, :5, :] - p2 = p4d.ix[:, :, 5:, :] - - result = concat([p1, p2], axis=2) - tm.assert_panel4d_equal(result, p4d) - - p1 = p4d.ix[:, :, :, :2] - p2 = p4d.ix[:, :, :, 2:] - - result = concat([p1, p2], axis=3) - tm.assert_panel4d_equal(result, p4d) - - def test_panel4d_concat_mixed_type(self): - p4d = tm.makePanel4D() - - # if things are a bit misbehaved - p1 = p4d.ix[:, :2, :, :2] - p2 = p4d.ix[:, :, :, 2:] - p1['L5'] = 'baz' - - result = concat([p1, p2], axis=3) - - p2['L5'] = np.nan - expected = concat([p1, p2], axis=3) - expected = expected.ix[result.labels] - - tm.assert_panel4d_equal(result, expected) - - def test_concat_series(self): - - ts = tm.makeTimeSeries() - ts.name = 'foo' - - pieces = [ts[:5], ts[5:15], ts[15:]] - - result = concat(pieces) - tm.assert_series_equal(result, ts) - self.assertEqual(result.name, ts.name) - - result = concat(pieces, keys=[0, 1, 2]) - expected = ts.copy() - - ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) - - exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]), - np.arange(len(ts))] - exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], - labels=exp_labels) - expected.index = exp_index - tm.assert_series_equal(result, expected) - - def test_concat_series_axis1(self): - ts = tm.makeTimeSeries() - - pieces = [ts[:-2], ts[2:], ts[2:-2]] - - result = concat(pieces, axis=1) - expected = DataFrame(pieces).T - assert_frame_equal(result, expected) - - result = concat(pieces, keys=['A', 'B', 'C'], axis=1) - expected = DataFrame(pieces, index=['A', 'B', 'C']).T - assert_frame_equal(result, expected) - - # preserve series names, #2489 - s = Series(randn(5), name='A') - s2 = Series(randn(5), name='B') - - result = concat([s, s2], axis=1) - expected = DataFrame({'A': s, 'B': s2}) - assert_frame_equal(result, expected) - - s2.name = None - result = concat([s, s2], axis=1) - self.assertTrue(np.array_equal( - result.columns, Index(['A', 0], dtype='object'))) - - # must reindex, #2603 - s = Series(randn(3), index=['c', 'a', 'b'], name='A') - s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') - result = concat([s, s2], axis=1) - expected = DataFrame({'A': s, 'B': s2}) - assert_frame_equal(result, expected) - - def test_concat_single_with_key(self): - df = DataFrame(np.random.randn(10, 4)) - - result = concat([df], keys=['foo']) - expected = concat([df, df], keys=['foo', 'bar']) - tm.assert_frame_equal(result, expected[:10]) - - def test_concat_exclude_none(self): - df = DataFrame(np.random.randn(10, 4)) - - pieces = [df[:5], None, None, df[5:]] - result = 
concat(pieces) - tm.assert_frame_equal(result, df) - self.assertRaises(ValueError, concat, [None, None]) - - def test_concat_datetime64_block(self): - from pandas.tseries.index import date_range - - rng = date_range('1/1/2000', periods=10) - - df = DataFrame({'time': rng}) - - result = concat([df, df]) - self.assertTrue((result.iloc[:10]['time'] == rng).all()) - self.assertTrue((result.iloc[10:]['time'] == rng).all()) - - def test_concat_timedelta64_block(self): - from pandas import to_timedelta - - rng = to_timedelta(np.arange(10), unit='s') - - df = DataFrame({'time': rng}) - - result = concat([df, df]) - self.assertTrue((result.iloc[:10]['time'] == rng).all()) - self.assertTrue((result.iloc[10:]['time'] == rng).all()) - - def test_concat_keys_with_none(self): - # #1649 - df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]]) - - result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0)) - expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) - tm.assert_frame_equal(result, expected) - - result = concat([None, df0, df0[:2], df0[:1], df0], - keys=['a', 'b', 'c', 'd', 'e']) - expected = concat([df0, df0[:2], df0[:1], df0], - keys=['b', 'c', 'd', 'e']) - tm.assert_frame_equal(result, expected) - - def test_concat_bug_1719(self): - ts1 = tm.makeTimeSeries() - ts2 = tm.makeTimeSeries()[::2] - - # to join with union - # these two are of different length! - left = concat([ts1, ts2], join='outer', axis=1) - right = concat([ts2, ts1], join='outer', axis=1) - - self.assertEqual(len(left), len(right)) - - def test_concat_bug_2972(self): - ts0 = Series(np.zeros(5)) - ts1 = Series(np.ones(5)) - ts0.name = ts1.name = 'same name' - result = concat([ts0, ts1], axis=1) - - expected = DataFrame({0: ts0, 1: ts1}) - expected.columns = ['same name', 'same name'] - assert_frame_equal(result, expected) - - def test_concat_bug_3602(self): - - # GH 3602, duplicate columns - df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'stringvar': [ - 'rrr', 'rrr', 'rrr', 'rrr'], 'prc': [6, 6, 6, 6]}) - df2 = DataFrame({'misc': [1, 2, 3, 4], 'prc': [ - 6, 6, 6, 6], 'C': [9, 10, 11, 12]}) - expected = DataFrame([[0, 6, 'rrr', 9, 1, 6], - [0, 6, 'rrr', 10, 2, 6], - [0, 6, 'rrr', 11, 3, 6], - [0, 6, 'rrr', 12, 4, 6]]) - expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc'] - - result = concat([df1, df2], axis=1) - assert_frame_equal(result, expected) - - def test_concat_series_axis1_same_names_ignore_index(self): - dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1] - s1 = Series(randn(len(dates)), index=dates, name='value') - s2 = Series(randn(len(dates)), index=dates, name='value') - - result = concat([s1, s2], axis=1, ignore_index=True) - self.assertTrue(np.array_equal(result.columns, [0, 1])) - - def test_concat_iterables(self): - from collections import deque, Iterable - - # GH8645 check concat works with tuples, list, generators, and weird - # stuff like deque and custom iterables - df1 = DataFrame([1, 2, 3]) - df2 = DataFrame([4, 5, 6]) - expected = DataFrame([1, 2, 3, 4, 5, 6]) - assert_frame_equal(concat((df1, df2), ignore_index=True), expected) - assert_frame_equal(concat([df1, df2], ignore_index=True), expected) - assert_frame_equal(concat((df for df in (df1, df2)), - ignore_index=True), expected) - assert_frame_equal( - concat(deque((df1, df2)), ignore_index=True), expected) - - class CustomIterator1(object): - - def __len__(self): - return 2 - - def __getitem__(self, index): - try: - return {0: df1, 1: df2}[index] - except KeyError: - raise IndexError - 
assert_frame_equal(pd.concat(CustomIterator1(), - ignore_index=True), expected) - - class CustomIterator2(Iterable): - - def __iter__(self): - yield df1 - yield df2 - assert_frame_equal(pd.concat(CustomIterator2(), - ignore_index=True), expected) - - def test_concat_invalid(self): - - # trying to concat a ndframe with a non-ndframe - df1 = mkdf(10, 2) - for obj in [1, dict(), [1, 2], (1, 2)]: - self.assertRaises(TypeError, lambda x: concat([df1, obj])) - - def test_concat_invalid_first_argument(self): - df1 = mkdf(10, 2) - df2 = mkdf(10, 2) - self.assertRaises(TypeError, concat, df1, df2) - - # generator ok though - concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) - - # text reader ok - # GH6583 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - reader = read_csv(StringIO(data), chunksize=1) - result = concat(reader, ignore_index=True) - expected = read_csv(StringIO(data)) - assert_frame_equal(result, expected) - - -class TestOrderedMerge(tm.TestCase): - - def setUp(self): - self.left = DataFrame({'key': ['a', 'c', 'e'], - 'lvalue': [1, 2., 3]}) - - self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], - 'rvalue': [1, 2, 3., 4]}) - - # GH #813 - - def test_basic(self): - result = ordered_merge(self.left, self.right, on='key') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], - 'lvalue': [1, nan, 2, nan, 3, nan], - 'rvalue': [nan, 1, 2, 3, nan, 4]}) - - assert_frame_equal(result, expected) - - def test_ffill(self): - result = ordered_merge( - self.left, self.right, on='key', fill_method='ffill') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], - 'lvalue': [1., 1, 2, 2, 3, 3.], - 'rvalue': [nan, 1, 2, 3, 3, 4]}) - assert_frame_equal(result, expected) - - def test_multigroup(self): - left = concat([self.left, self.left], ignore_index=True) - # right = concat([self.right, self.right], ignore_index=True) - - left['group'] = ['a'] * 3 + ['b'] * 3 - # right['group'] = ['a'] * 4 + ['b'] * 4 - - result = ordered_merge(left, self.right, on='key', left_by='group', - fill_method='ffill') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, - 'lvalue': [1., 1, 2, 2, 3, 3.] 
* 2, - 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) - expected['group'] = ['a'] * 6 + ['b'] * 6 - - assert_frame_equal(result, expected.ix[:, result.columns]) - - result2 = ordered_merge(self.right, left, on='key', right_by='group', - fill_method='ffill') - assert_frame_equal(result, result2.ix[:, result.columns]) - - result = ordered_merge(left, self.right, on='key', left_by='group') - self.assertTrue(result['group'].notnull().all()) - - def test_merge_type(self): - class NotADataFrame(DataFrame): - - @property - def _constructor(self): - return NotADataFrame - - nad = NotADataFrame(self.left) - result = nad.merge(self.right, on='key') - - tm.assertIsInstance(result, NotADataFrame) - - def test_empty_sequence_concat(self): - # GH 9157 - empty_pat = "[Nn]o objects" - none_pat = "objects.*None" - test_cases = [ - ((), empty_pat), - ([], empty_pat), - ({}, empty_pat), - ([None], none_pat), - ([None, None], none_pat) - ] - for df_seq, pattern in test_cases: - assertRaisesRegexp(ValueError, pattern, pd.concat, df_seq) - - pd.concat([pd.DataFrame()]) - pd.concat([None, pd.DataFrame()]) - pd.concat([pd.DataFrame(), None]) - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tools/tests/test_ordered_merge.py b/pandas/tools/tests/test_ordered_merge.py new file mode 100644 index 0000000000000..53f00d9761f32 --- /dev/null +++ b/pandas/tools/tests/test_ordered_merge.py @@ -0,0 +1,93 @@ +import nose + +import pandas as pd +from pandas import DataFrame, ordered_merge +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal + +from numpy import nan + + +class TestOrderedMerge(tm.TestCase): + + def setUp(self): + self.left = DataFrame({'key': ['a', 'c', 'e'], + 'lvalue': [1, 2., 3]}) + + self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], + 'rvalue': [1, 2, 3., 4]}) + + # GH #813 + + def test_basic(self): + result = ordered_merge(self.left, self.right, on='key') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1, nan, 2, nan, 3, nan], + 'rvalue': [nan, 1, 2, 3, nan, 4]}) + + assert_frame_equal(result, expected) + + def test_ffill(self): + result = ordered_merge( + self.left, self.right, on='key', fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1., 1, 2, 2, 3, 3.], + 'rvalue': [nan, 1, 2, 3, 3, 4]}) + assert_frame_equal(result, expected) + + def test_multigroup(self): + left = pd.concat([self.left, self.left], ignore_index=True) + # right = concat([self.right, self.right], ignore_index=True) + + left['group'] = ['a'] * 3 + ['b'] * 3 + # right['group'] = ['a'] * 4 + ['b'] * 4 + + result = ordered_merge(left, self.right, on='key', left_by='group', + fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, + 'lvalue': [1., 1, 2, 2, 3, 3.] 
* 2, + 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) + expected['group'] = ['a'] * 6 + ['b'] * 6 + + assert_frame_equal(result, expected.ix[:, result.columns]) + + result2 = ordered_merge(self.right, left, on='key', right_by='group', + fill_method='ffill') + assert_frame_equal(result, result2.ix[:, result.columns]) + + result = ordered_merge(left, self.right, on='key', left_by='group') + self.assertTrue(result['group'].notnull().all()) + + def test_merge_type(self): + class NotADataFrame(DataFrame): + + @property + def _constructor(self): + return NotADataFrame + + nad = NotADataFrame(self.left) + result = nad.merge(self.right, on='key') + + tm.assertIsInstance(result, NotADataFrame) + + def test_empty_sequence_concat(self): + # GH 9157 + empty_pat = "[Nn]o objects" + none_pat = "objects.*None" + test_cases = [ + ((), empty_pat), + ([], empty_pat), + ({}, empty_pat), + ([None], none_pat), + ([None, None], none_pat) + ] + for df_seq, pattern in test_cases: + tm.assertRaisesRegexp(ValueError, pattern, pd.concat, df_seq) + + pd.concat([pd.DataFrame()]) + pd.concat([None, pd.DataFrame()]) + pd.concat([pd.DataFrame(), None]) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 5ebd2e4f693cf..82feaae13f771 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -1,13 +1,12 @@ from datetime import datetime, date, timedelta import numpy as np -from numpy.testing import assert_equal import pandas as pd from pandas import DataFrame, Series, Index, MultiIndex, Grouper from pandas.tools.merge import concat from pandas.tools.pivot import pivot_table, crosstab -from pandas.compat import range, u, product +from pandas.compat import range, product import pandas.util.testing as tm @@ -80,21 +79,13 @@ def test_pivot_table_dropna(self): pv_ind = df.pivot_table( 'quantity', ['customer', 'product'], 'month', dropna=False) - m = MultiIndex.from_tuples([(u('A'), u('a')), - (u('A'), u('b')), - (u('A'), u('c')), - (u('A'), u('d')), - (u('B'), u('a')), - (u('B'), u('b')), - (u('B'), u('c')), - (u('B'), u('d')), - (u('C'), u('a')), - (u('C'), u('b')), - (u('C'), u('c')), - (u('C'), u('d'))]) - - assert_equal(pv_col.columns.values, m.values) - assert_equal(pv_ind.index.values, m.values) + m = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('A', 'c'), + ('A', 'd'), ('B', 'a'), ('B', 'b'), + ('B', 'c'), ('B', 'd'), ('C', 'a'), + ('C', 'b'), ('C', 'c'), ('C', 'd')], + names=['customer', 'product']) + tm.assert_index_equal(pv_col.columns, m) + tm.assert_index_equal(pv_ind.index, m) def test_pass_array(self): result = self.data.pivot_table( @@ -902,8 +893,9 @@ def test_crosstab_dropna(self): res = pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'], dropna=False) m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'), - ('two', 'dull'), ('two', 'shiny')]) - assert_equal(res.columns.values, m.values) + ('two', 'dull'), ('two', 'shiny')], + names=['b', 'c']) + tm.assert_index_equal(res.columns, m) def test_categorical_margins(self): # GH 10989 diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 55f27e1466a92..0b91fd1ef1c02 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -4,7 +4,7 @@ import numpy as np from pandas.compat import zip -from pandas import Series +from pandas import Series, Index import pandas.util.testing as tm from pandas.util.testing import 
assertRaisesRegexp import pandas.core.common as com @@ -19,32 +19,41 @@ class TestCut(tm.TestCase): def test_simple(self): data = np.ones(5) result = cut(data, 4, labels=False) - desired = [1, 1, 1, 1, 1] + desired = np.array([1, 1, 1, 1, 1], dtype=np.int64) tm.assert_numpy_array_equal(result, desired) def test_bins(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) result, bins = cut(data, 3, retbins=True) - tm.assert_numpy_array_equal(result.codes, [0, 0, 0, 1, 2, 0]) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + + exp_codes = np.array([0, 0, 0, 1, 2, 0], dtype=np.int8) + tm.assert_numpy_array_equal(result.codes, exp_codes) + exp = np.array([0.1905, 3.36666667, 6.53333333, 9.7]) + tm.assert_almost_equal(bins, exp) def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) - tm.assert_numpy_array_equal(result.codes, [0, 0, 0, 2, 3, 0, 0]) - tm.assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7]) + exp_codes = np.array([0, 0, 0, 2, 3, 0, 0], dtype=np.int8) + tm.assert_numpy_array_equal(result.codes, exp_codes) + exp = np.array([0.1905, 2.575, 4.95, 7.325, 9.7]) + tm.assert_numpy_array_equal(bins, exp) def test_noright(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) - tm.assert_numpy_array_equal(result.codes, [0, 0, 0, 2, 3, 0, 1]) - tm.assert_almost_equal(bins, [0.2, 2.575, 4.95, 7.325, 9.7095]) + exp_codes = np.array([0, 0, 0, 2, 3, 0, 1], dtype=np.int8) + tm.assert_numpy_array_equal(result.codes, exp_codes) + exp = np.array([0.2, 2.575, 4.95, 7.325, 9.7095]) + tm.assert_almost_equal(bins, exp) def test_arraylike(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) - tm.assert_numpy_array_equal(result.codes, [0, 0, 0, 1, 2, 0]) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + exp_codes = np.array([0, 0, 0, 1, 2, 0], dtype=np.int8) + tm.assert_numpy_array_equal(result.codes, exp_codes) + exp = np.array([0.1905, 3.36666667, 6.53333333, 9.7]) + tm.assert_almost_equal(bins, exp) def test_bins_not_monotonic(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] @@ -72,14 +81,14 @@ def test_labels(self): arr = np.tile(np.arange(0, 1.01, 0.1), 4) result, bins = cut(arr, 4, retbins=True) - ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', - '(0.75, 1]'] - self.assert_numpy_array_equal(result.categories, ex_levels) + ex_levels = Index(['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', + '(0.75, 1]']) + self.assert_index_equal(result.categories, ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) - ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', - '[0.75, 1.001)'] - self.assert_numpy_array_equal(result.categories, ex_levels) + ex_levels = Index(['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', + '[0.75, 1.001)']) + self.assert_index_equal(result.categories, ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -91,9 +100,9 @@ def test_label_precision(self): arr = np.arange(0, 0.73, 0.01) result = cut(arr, 4, precision=2) - ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]', - '(0.54, 0.72]'] - self.assert_numpy_array_equal(result.categories, ex_levels) + ex_levels = Index(['(-0.00072, 0.18]', '(0.18, 0.36]', + '(0.36, 0.54]', '(0.54, 0.72]']) + self.assert_index_equal(result.categories, ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -118,10 +127,10 @@ def 
test_inf_handling(self): result = cut(data, [-np.inf, 2, 4, np.inf]) result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf]) - ex_categories = ['(-inf, 2]', '(2, 4]', '(4, inf]'] + ex_categories = Index(['(-inf, 2]', '(2, 4]', '(4, inf]']) - tm.assert_numpy_array_equal(result.categories, ex_categories) - tm.assert_numpy_array_equal(result_ser.cat.categories, ex_categories) + tm.assert_index_equal(result.categories, ex_categories) + tm.assert_index_equal(result_ser.cat.categories, ex_categories) self.assertEqual(result[5], '(4, inf]') self.assertEqual(result[0], '(-inf, 2]') self.assertEqual(result_ser[5], '(4, inf]') @@ -135,7 +144,7 @@ def test_qcut(self): tm.assert_almost_equal(bins, ex_bins) ex_levels = cut(arr, ex_bins, include_lowest=True) - self.assert_numpy_array_equal(labels, ex_levels) + self.assert_categorical_equal(labels, ex_levels) def test_qcut_bounds(self): arr = np.random.randn(1000) @@ -148,7 +157,7 @@ def test_qcut_specify_quantiles(self): factor = qcut(arr, [0, .25, .5, .75, 1.]) expected = qcut(arr, 4) - self.assertTrue(factor.equals(expected)) + tm.assert_categorical_equal(factor, expected) def test_qcut_all_bins_same(self): assertRaisesRegexp(ValueError, "edges.*unique", qcut, @@ -173,7 +182,7 @@ def test_cut_pass_labels(self): exp = cut(arr, bins) exp.categories = labels - self.assertTrue(result.equals(exp)) + tm.assert_categorical_equal(result, exp) def test_qcut_include_lowest(self): values = np.arange(10) @@ -253,12 +262,14 @@ def test_series_retbins(self): # GH 8589 s = Series(np.arange(4)) result, bins = cut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [-0.003, 1.5, 3]) + tm.assert_numpy_array_equal(result.cat.codes.values, + np.array([0, 0, 1, 1], dtype=np.int8)) + tm.assert_numpy_array_equal(bins, np.array([-0.003, 1.5, 3])) result, bins = qcut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [0, 1.5, 3]) + tm.assert_numpy_array_equal(result.cat.codes.values, + np.array([0, 0, 1, 1], dtype=np.int8)) + tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) def curpath(): diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index 1c4f55b2defa4..c592b33bdab9a 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -4,7 +4,6 @@ import nose import numpy as np -from numpy.testing import assert_equal import pandas as pd from pandas import date_range, Index @@ -19,18 +18,21 @@ class TestCartesianProduct(tm.TestCase): def test_simple(self): x, y = list('ABC'), [1, 22] - result = cartesian_product([x, y]) - expected = [np.array(['A', 'A', 'B', 'B', 'C', 'C']), - np.array([1, 22, 1, 22, 1, 22])] - assert_equal(result, expected) + result1, result2 = cartesian_product([x, y]) + expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C']) + expected2 = np.array([1, 22, 1, 22, 1, 22]) + tm.assert_numpy_array_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) def test_datetimeindex(self): # regression test for GitHub issue #6439 # make sure that the ordering on datetimeindex is consistent x = date_range('2000-01-01', periods=2) - result = [Index(y).day for y in cartesian_product([x, x])] - expected = [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])] - assert_equal(result, expected) + result1, result2 = [Index(y).day for y in cartesian_product([x, x])] + expected1 = np.array([1, 1, 2, 2], dtype=np.int32) + expected2 = np.array([1, 2, 1, 2], dtype=np.int32) + 
tm.assert_numpy_array_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) class TestLocaleUtils(tm.TestCase): @@ -277,6 +279,18 @@ def test_period(self): # res = pd.to_numeric(pd.Series(idx, name='xxx')) # tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) + def test_non_hashable(self): + # Test for Bug #13324 + s = pd.Series([[10.0, 2], 1.0, 'apple']) + res = pd.to_numeric(s, errors='coerce') + tm.assert_series_equal(res, pd.Series([np.nan, 1.0, np.nan])) + + res = pd.to_numeric(s, errors='ignore') + tm.assert_series_equal(res, pd.Series([[10.0, 2], 1.0, 'apple'])) + + with self.assertRaisesRegexp(TypeError, "Invalid object type"): + pd.to_numeric(s) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 0f58d17f0ade4..42631d442a990 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -9,6 +9,7 @@ from pandas.compat.numpy import function as nv import numpy as np + from pandas.core import common as com, algorithms from pandas.core.common import (is_integer, is_float, is_bool_dtype, AbstractMethodError) @@ -74,22 +75,16 @@ def _round(self, freq, rounder): unit = to_offset(freq).nanos # round the local times - if getattr(self, 'tz', None) is not None: - values = self.tz_localize(None).asi8 - else: - values = self.asi8 + values = _ensure_datetimelike_to_i8(self) + result = (unit * rounder(values / float(unit))).astype('i8') attribs = self._get_attributes_dict() if 'freq' in attribs: attribs['freq'] = None if 'tz' in attribs: attribs['tz'] = None - result = self._shallow_copy(result, **attribs) - - # reconvert to local tz - if getattr(self, 'tz', None) is not None: - result = result.tz_localize(self.tz) - return result + return self._ensure_localized( + self._shallow_copy(result, **attribs)) @Appender(_round_doc % "round") def round(self, freq, *args, **kwargs): @@ -161,6 +156,29 @@ def _evaluate_compare(self, other, op): except TypeError: return result + def _ensure_localized(self, result): + """ + ensure that we are re-localized + + This is for compat as we can then call this on all datetimelike + indexes generally (ignored for Period/Timedelta) + + Parameters + ---------- + result : DatetimeIndex / i8 ndarray + + Returns + ------- + localized DTI + """ + + # reconvert to local tz + if getattr(self, 'tz', None) is not None: + if not isinstance(result, com.ABCIndexClass): + result = self._simple_new(result) + result = result.tz_localize(self.tz) + return result + @property def _box_func(self): """ @@ -189,8 +207,17 @@ def __contains__(self, key): return False def __getitem__(self, key): + """ + This getitem defers to the underlying array, which by-definition can + only handle list-likes, slices, and integer scalars + """ + + is_int = is_integer(key) + if lib.isscalar(key) and not is_int: + raise ValueError + getitem = self._data.__getitem__ - if lib.isscalar(key): + if is_int: val = getitem(key) return self._box_func(val) else: @@ -718,6 +745,27 @@ def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self.values.repeat(repeats), freq=None) + def where(self, cond, other=None): + """ + .. versionadded:: 0.18.2 + + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. 
+ + Parameters + ---------- + cond : boolean same length as self + other : scalar, or array-like + """ + other = _ensure_datetimelike_to_i8(other) + values = _ensure_datetimelike_to_i8(self) + result = np.where(cond, values, other).astype('i8') + + result = self._ensure_localized(result) + return self._shallow_copy(result, + **self._get_attributes_dict()) + def summary(self, name=None): """ return a summarized representation @@ -739,3 +787,19 @@ def summary(self, name=None): # display as values, not quoted result = result.replace("'", "") return result + + +def _ensure_datetimelike_to_i8(other): + """ helper for coercing an input scalar or array to i8 """ + if lib.isscalar(other) and com.isnull(other): + other = tslib.iNaT + elif isinstance(other, com.ABCIndexClass): + + # convert tz if needed + if getattr(other, 'tz', None) is not None: + other = other.tz_localize(None).asi8 + else: + other = other.asi8 + else: + other = np.array(other, copy=False).view('i8') + return other diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index 8ccfdfa05e9b5..78b185ae8cf31 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -23,6 +23,24 @@ from pandas.tseries.frequencies import FreqGroup from pandas.tseries.period import Period, PeriodIndex +# constants +HOURS_PER_DAY = 24. +MIN_PER_HOUR = 60. +SEC_PER_MIN = 60. + +SEC_PER_HOUR = SEC_PER_MIN * MIN_PER_HOUR +SEC_PER_DAY = SEC_PER_HOUR * HOURS_PER_DAY + +MUSEC_PER_DAY = 1e6 * SEC_PER_DAY + + +def _mpl_le_2_0_0(): + try: + import matplotlib + return matplotlib.compare_versions('2.0.0', matplotlib.__version__) + except ImportError: + return False + def register(): units.registry[lib.Timestamp] = DatetimeConverter() @@ -221,6 +239,13 @@ def __init__(self, locator, tz=None, defaultfmt='%Y-%m-%d'): if self._tz is dates.UTC: self._tz._utcoffset = self._tz.utcoffset(None) + # For mpl > 2.0 the format strings are controlled via rcparams + # so do not mess with them. For mpl < 2.0 change the second + # break point and add a musec break point + if _mpl_le_2_0_0(): + self.scaled[1. / SEC_PER_DAY] = '%H:%M:%S' + self.scaled[1. 
/ MUSEC_PER_DAY] = '%H:%M:%S.%f' + class PandasAutoDateLocator(dates.AutoDateLocator): diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 25d3490873542..83ab5d2a2bce4 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -6,16 +6,17 @@ from datetime import timedelta import numpy as np from pandas.core.base import _shared_docs -from pandas.core.common import (_NS_DTYPE, _INT64_DTYPE, - _values_from_object, _maybe_box, - is_object_dtype, is_datetime64_dtype, - is_datetimetz, is_dtype_equal, - ABCSeries, is_integer, is_float, - DatetimeTZDtype, PerformanceWarning) +from pandas.core.common import (_INT64_DTYPE, _NS_DTYPE, _maybe_box, + _values_from_object, ABCSeries, + DatetimeTZDtype, PerformanceWarning, + is_datetimetz, is_datetime64_dtype, + is_datetime64_ns_dtype, is_dtype_equal, + is_float, is_integer, is_integer_dtype, + is_object_dtype, is_string_dtype) from pandas.core.index import Index, Int64Index, Float64Index +from pandas.indexes.base import _index_shared_docs import pandas.compat as compat -from pandas.compat import u from pandas.tseries.frequencies import ( to_offset, get_period_alias, Resolution) @@ -814,8 +815,7 @@ def _add_offset(self, offset): "or DatetimeIndex", PerformanceWarning) return self.astype('O') + offset - def _format_native_types(self, na_rep=u('NaT'), - date_format=None, **kwargs): + def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): from pandas.formats.format import _get_format_datetime64_from_values format = _get_format_datetime64_from_values(self, date_format) @@ -827,19 +827,24 @@ def _format_native_types(self, na_rep=u('NaT'), def to_datetime(self, dayfirst=False): return self.copy() - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): dtype = np.dtype(dtype) - if dtype == np.object_: + if is_object_dtype(dtype): return self.asobject - elif dtype == _INT64_DTYPE: - return self.asi8.copy() - elif dtype == _NS_DTYPE and self.tz is not None: - return self.tz_convert('UTC').tz_localize(None) - elif dtype == str: + elif is_integer_dtype(dtype): + return Index(self.values.astype('i8', copy=copy), name=self.name, + dtype='i8') + elif is_datetime64_ns_dtype(dtype): + if self.tz is not None: + return self.tz_convert('UTC').tz_localize(None) + elif copy is True: + return self.copy() + return self + elif is_string_dtype(dtype): return Index(self.format(), name=self.name, dtype=object) - else: # pragma: no cover - raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) + raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) def _get_time_micros(self): utc = _utc() diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index fb91185746181..c3deee5f6dab2 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -15,11 +15,12 @@ _quarter_to_myear) from pandas.core.base import _shared_docs +from pandas.indexes.base import _index_shared_docs import pandas.core.common as com -from pandas.core.common import (isnull, _INT64_DTYPE, _maybe_box, - _values_from_object, ABCSeries, - is_integer, is_float, is_object_dtype) +from pandas.core.common import ( + _maybe_box, _values_from_object, ABCSeries, is_float, is_integer, + is_integer_dtype, is_object_dtype, isnull) from pandas import compat from pandas.compat.numpy import function as nv from pandas.util.decorators import Appender, cache_readonly, Substitution @@ -271,10 +272,15 @@ def _from_arraylike(cls, data, freq, tz): @classmethod def _simple_new(cls, values, name=None, 
freq=None, **kwargs): - if not getattr(values, 'dtype', None): + + if not com.is_integer_dtype(values): values = np.array(values, copy=False) - if is_object_dtype(values): - return PeriodIndex(values, name=name, freq=freq, **kwargs) + if (len(values) > 0 and com.is_float_dtype(values)): + raise TypeError("PeriodIndex can't take floats") + else: + return PeriodIndex(values, name=name, freq=freq, **kwargs) + + values = np.array(values, dtype='int64', copy=False) result = object.__new__(cls) result._data = values @@ -381,12 +387,14 @@ def asof_locs(self, where, mask): def _array_values(self): return self.asobject - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): dtype = np.dtype(dtype) - if dtype == np.object_: - return Index(np.array(list(self), dtype), dtype) - elif dtype == _INT64_DTYPE: - return Index(self.values, dtype) + if is_object_dtype(dtype): + return self.asobject + elif is_integer_dtype(dtype): + return Index(self.values.astype('i8', copy=copy), name=self.name, + dtype='i8') raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) @Substitution(klass='PeriodIndex', value='key') diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index a0f08a93a07d9..8d6955ab43711 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1,6 +1,7 @@ from datetime import timedelta import numpy as np import warnings +import copy import pandas as pd from pandas.core.base import AbstractMethodError, GroupByMixin @@ -15,9 +16,12 @@ from pandas.tseries.period import PeriodIndex, period_range import pandas.core.common as com import pandas.core.algorithms as algos + import pandas.compat as compat +from pandas.compat.numpy import function as nv from pandas.lib import Timestamp +from pandas._period import IncompatibleFrequency import pandas.lib as lib import pandas.tslib as tslib @@ -479,7 +483,7 @@ def asfreq(self): """ return self._upsample('asfreq') - def std(self, ddof=1): + def std(self, ddof=1, *args, **kwargs): """ Compute standard deviation of groups, excluding missing values @@ -488,9 +492,10 @@ def std(self, ddof=1): ddof : integer, default 1 degrees of freedom """ + nv.validate_resampler_func('std', args, kwargs) return self._downsample('std', ddof=ddof) - def var(self, ddof=1): + def var(self, ddof=1, *args, **kwargs): """ Compute variance of groups, excluding missing values @@ -499,6 +504,7 @@ def var(self, ddof=1): ddof : integer, default 1 degrees of freedom """ + nv.validate_resampler_func('var', args, kwargs) return self._downsample('var', ddof=ddof) Resampler._deprecated_valids += dir(Resampler) @@ -506,7 +512,8 @@ def var(self, ddof=1): for method in ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem', 'median', 'prod', 'ohlc']: - def f(self, _method=method): + def f(self, _method=method, *args, **kwargs): + nv.validate_resampler_func(_method, args, kwargs) return self._downsample(_method) f.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, f) @@ -592,7 +599,7 @@ def __init__(self, obj, *args, **kwargs): self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True - self.groupby = parent.groupby + self.groupby = copy.copy(parent.groupby) def _apply(self, f, **kwargs): """ @@ -789,16 +796,17 @@ def _downsample(self, how, **kwargs): ax = self.ax new_index = self._get_new_index() - if len(new_index) == 0: - return self._wrap_result(self._selected_obj.reindex(new_index)) # Start vs. 
end of period memb = ax.asfreq(self.freq, how=self.convention) if is_subperiod(ax.freq, self.freq): # Downsampling - rng = np.arange(memb.values[0], memb.values[-1] + 1) - bins = memb.searchsorted(rng, side='right') + if len(new_index) == 0: + bins = [] + else: + rng = np.arange(memb.values[0], memb.values[-1] + 1) + bins = memb.searchsorted(rng, side='right') grouper = BinGrouper(bins, new_index) return self._groupby_and_aggregate(how, grouper=grouper) elif is_superperiod(ax.freq, self.freq): @@ -806,10 +814,9 @@ def _downsample(self, how, **kwargs): elif ax.freq == self.freq: return self.asfreq() - raise ValueError('Frequency {axfreq} cannot be ' - 'resampled to {freq}'.format( - axfreq=ax.freq, - freq=self.freq)) + raise IncompatibleFrequency( + 'Frequency {} cannot be resampled to {}, as they are not ' + 'sub or super periods'.format(ax.freq, self.freq)) def _upsample(self, method, limit=None): """ @@ -832,9 +839,6 @@ def _upsample(self, method, limit=None): obj = self.obj new_index = self._get_new_index() - if len(new_index) == 0: - return self._wrap_result(self._selected_obj.reindex(new_index)) - # Start vs. end of period memb = ax.asfreq(self.freq, how=self.convention) @@ -908,8 +912,7 @@ def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, return _maybe_process_deprecations(r, how=how, fill_method=fill_method, - limit=limit, - **kwargs) + limit=limit) class TimeGrouper(Grouper): diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 7d731c28c0f88..3e12cf14e7485 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -2,15 +2,17 @@ from datetime import timedelta import numpy as np -from pandas.core.common import (ABCSeries, _TD_DTYPE, _INT64_DTYPE, - _maybe_box, +from pandas.core.common import (ABCSeries, _TD_DTYPE, _maybe_box, _values_from_object, isnull, - is_integer, is_float) + is_integer, is_float, is_integer_dtype, + is_object_dtype, is_timedelta64_dtype, + is_timedelta64_ns_dtype) from pandas.core.index import Index, Int64Index import pandas.compat as compat from pandas.compat import u from pandas.tseries.frequencies import to_offset from pandas.core.base import _shared_docs +from pandas.indexes.base import _index_shared_docs import pandas.core.common as com import pandas.types.concat as _concat from pandas.util.decorators import Appender, Substitution @@ -435,28 +437,28 @@ def to_pytimedelta(self): """ return tslib.ints_to_pytimedelta(self.asi8) - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): dtype = np.dtype(dtype) - if dtype == np.object_: + if is_object_dtype(dtype): return self.asobject - elif dtype == _INT64_DTYPE: - return self.asi8.copy() - elif dtype == _TD_DTYPE: + elif is_timedelta64_ns_dtype(dtype): + if copy is True: + return self.copy() return self - elif dtype.kind == 'm': - + elif is_timedelta64_dtype(dtype): # return an index (essentially this is division) - result = self.values.astype(dtype) + result = self.values.astype(dtype, copy=copy) if self.hasnans: return Index(self._maybe_mask_results(result, convert='float64'), name=self.name) - return Index(result.astype('i8'), name=self.name) - - else: # pragma: no cover - raise ValueError('Cannot cast TimedeltaIndex to dtype %s' % dtype) + elif is_integer_dtype(dtype): + return Index(self.values.astype('i8', copy=copy), dtype='i8', + name=self.name) + raise ValueError('Cannot cast TimedeltaIndex to dtype %s' % dtype) def union(self, other): """ diff --git a/pandas/tseries/tests/test_base.py 
b/pandas/tseries/tests/test_base.py index 2077409f4afec..7077a23d5abcb 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -50,39 +50,6 @@ def test_ops_properties_basic(self): self.assertEqual(s.day, 10) self.assertRaises(AttributeError, lambda: s.weekday) - def test_astype_str(self): - # test astype string - #10442 - result = date_range('2012-01-01', periods=4, - name='test_name').astype(str) - expected = Index(['2012-01-01', '2012-01-02', '2012-01-03', - '2012-01-04'], name='test_name', dtype=object) - tm.assert_index_equal(result, expected) - - # test astype string with tz and name - result = date_range('2012-01-01', periods=3, name='test_name', - tz='US/Eastern').astype(str) - expected = Index(['2012-01-01 00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - name='test_name', dtype=object) - tm.assert_index_equal(result, expected) - - # test astype string with freqH and name - result = date_range('1/1/2011', periods=3, freq='H', - name='test_name').astype(str) - expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00', - '2011-01-01 02:00:00'], - name='test_name', dtype=object) - tm.assert_index_equal(result, expected) - - # test astype string with freqH and timezone - result = date_range('3/6/2012 00:00', periods=2, freq='H', - tz='Europe/London', name='test_name').astype(str) - expected = Index(['2012-03-06 00:00:00+00:00', - '2012-03-06 01:00:00+00:00'], - dtype=object, name='test_name') - tm.assert_index_equal(result, expected) - def test_asobject_tolist(self): idx = pd.date_range(start='2013-01-01', periods=4, freq='M', name='idx') @@ -95,7 +62,7 @@ def test_asobject_tolist(self): self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -109,7 +76,7 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -122,7 +89,7 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -759,7 +726,7 @@ def test_asobject_tolist(self): self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -771,7 +738,7 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -1522,7 +1489,7 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) 
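Note: the repeated swap in these test_base.py hunks, from a bare self.assertTrue(result.equals(expected)) to self.assert_index_equal(result, expected), is about failure diagnostics rather than coverage: Index.equals compares values only and yields a bare False, while assert_index_equal also checks the index class and dtype and reports which attribute differs. A minimal sketch of the distinction, with illustrative values that are not taken from the patch:

import pandas as pd
import pandas.util.testing as tm

left = pd.Index([1, 2, 3], name='x')
right = pd.Index([1.0, 2.0, 3.0], name='x')   # equal values, float dtype

assert left.equals(right)                     # value-based check passes silently

try:
    tm.assert_index_equal(left, right)        # class/dtype-aware check fails
except AssertionError as exc:
    print(exc)                                # message explains what differs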
self.assertEqual(idx.tolist(), expected_list) diff --git a/pandas/tseries/tests/test_bin_groupby.py b/pandas/tseries/tests/test_bin_groupby.py new file mode 100644 index 0000000000000..6b6c468b7c391 --- /dev/null +++ b/pandas/tseries/tests/test_bin_groupby.py @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- + +from numpy import nan +import numpy as np + +from pandas import Index, isnull +from pandas.util.testing import assert_almost_equal +import pandas.util.testing as tm +import pandas.lib as lib +import pandas.algos as algos +from pandas.core import common as com + + +def test_series_grouper(): + from pandas import Series + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + + grouper = lib.SeriesGrouper(obj, np.mean, labels, 2, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[3:6].mean(), obj[6:].mean()]) + assert_almost_equal(result, expected) + + exp_counts = np.array([3, 4], dtype=np.int64) + assert_almost_equal(counts, exp_counts) + + +def test_series_bin_grouper(): + from pandas import Series + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + bins = np.array([3, 6]) + + grouper = lib.SeriesBinGrouper(obj, np.mean, bins, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) + assert_almost_equal(result, expected) + + exp_counts = np.array([3, 3, 4], dtype=np.int64) + assert_almost_equal(counts, exp_counts) + + +class TestBinGroupers(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.obj = np.random.randn(10, 1) + self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64) + self.bins = np.array([3, 6], dtype=np.int64) + + def test_generate_bins(self): + from pandas.core.groupby import generate_bins_generic + values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) + binner = np.array([0, 3, 6, 9], dtype=np.int64) + + for func in [lib.generate_bins_dt64, generate_bins_generic]: + bins = func(values, binner, closed='left') + assert ((bins == np.array([2, 5, 6])).all()) + + bins = func(values, binner, closed='right') + assert ((bins == np.array([3, 6, 6])).all()) + + for func in [lib.generate_bins_dt64, generate_bins_generic]: + values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) + binner = np.array([0, 3, 6], dtype=np.int64) + + bins = func(values, binner, closed='right') + assert ((bins == np.array([3, 6])).all()) + + self.assertRaises(ValueError, generate_bins_generic, values, [], + 'right') + self.assertRaises(ValueError, generate_bins_generic, values[:0], + binner, 'right') + + self.assertRaises(ValueError, generate_bins_generic, values, [4], + 'right') + self.assertRaises(ValueError, generate_bins_generic, values, [-3, -1], + 'right') + + +def test_group_ohlc(): + def _check(dtype): + obj = np.array(np.random.randn(20), dtype=dtype) + + bins = np.array([6, 12, 20]) + out = np.zeros((3, 4), dtype) + counts = np.zeros(len(out), dtype=np.int64) + labels = com._ensure_int64(np.repeat(np.arange(3), + np.diff(np.r_[0, bins]))) + + func = getattr(algos, 'group_ohlc_%s' % dtype) + func(out, counts, obj[:, None], labels) + + def _ohlc(group): + if isnull(group).all(): + return np.repeat(nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), + _ohlc(obj[12:])]) + + assert_almost_equal(out, expected) + tm.assert_numpy_array_equal(counts, + np.array([6, 6, 8], dtype=np.int64)) + + obj[:6] = nan + func(out, counts, obj[:, 
None], labels) + expected[0] = nan + assert_almost_equal(out, expected) + + _check('float32') + _check('float64') + + +class TestMoments(tm.TestCase): + pass + + +class TestReducer(tm.TestCase): + def test_int_index(self): + from pandas.core.series import Series + + arr = np.random.randn(100, 4) + result = lib.reduce(arr, np.sum, labels=Index(np.arange(4))) + expected = arr.sum(0) + assert_almost_equal(result, expected) + + result = lib.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) + expected = arr.sum(1) + assert_almost_equal(result, expected) + + dummy = Series(0., index=np.arange(100)) + result = lib.reduce(arr, np.sum, dummy=dummy, + labels=Index(np.arange(4))) + expected = arr.sum(0) + assert_almost_equal(result, expected) + + dummy = Series(0., index=np.arange(4)) + result = lib.reduce(arr, np.sum, axis=1, dummy=dummy, + labels=Index(np.arange(100))) + expected = arr.sum(1) + assert_almost_equal(result, expected) + + result = lib.reduce(arr, np.sum, axis=1, dummy=dummy, + labels=Index(np.arange(100))) + assert_almost_equal(result, expected) diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py index f2c20f7d3111d..ceb8660efb9cd 100644 --- a/pandas/tseries/tests/test_converter.py +++ b/pandas/tseries/tests/test_converter.py @@ -3,7 +3,6 @@ import nose import numpy as np -from numpy.testing import assert_almost_equal as np_assert_almost_equal from pandas import Timestamp, Period from pandas.compat import u import pandas.util.testing as tm @@ -69,14 +68,14 @@ def test_conversion_float(self): rs = self.dtc.convert( Timestamp('2012-1-1 01:02:03', tz='UTC'), None, None) xp = converter.dates.date2num(Timestamp('2012-1-1 01:02:03', tz='UTC')) - np_assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, decimals) rs = self.dtc.convert( Timestamp('2012-1-1 09:02:03', tz='Asia/Hong_Kong'), None, None) - np_assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, decimals) rs = self.dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None) - np_assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, decimals) def test_time_formatter(self): self.tc(90000) @@ -88,7 +87,7 @@ def test_dateindex_conversion(self): dateindex = tm.makeDateIndex(k=10, freq=freq) rs = self.dtc.convert(dateindex, None, None) xp = converter.dates.date2num(dateindex._mpl_repr()) - np_assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, decimals) def test_resolution(self): def _assert_less(ts1, ts2): diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 6e572289a3cae..6ad33b6b973de 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -25,15 +25,16 @@ def eq_gen_range(kwargs, expected): class TestGenRangeGeneration(tm.TestCase): + def test_generate(self): rng1 = list(generate_range(START, END, offset=datetools.bday)) rng2 = list(generate_range(START, END, time_rule='B')) - self.assert_numpy_array_equal(rng1, rng2) + self.assertEqual(rng1, rng2) def test_generate_cday(self): rng1 = list(generate_range(START, END, offset=datetools.cday)) rng2 = list(generate_range(START, END, time_rule='C')) - self.assert_numpy_array_equal(rng1, rng2) + self.assertEqual(rng1, rng2) def test_1(self): eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), @@ -68,8 +69,8 @@ def test_precision_finer_than_offset(self): freq='Q-DEC', tz=None) expected2 = DatetimeIndex(expected2_list, dtype='datetime64[ns]', freq='W-SUN', tz=None) - 
self.assertTrue(result1.equals(expected1)) - self.assertTrue(result2.equals(expected2)) + self.assert_index_equal(result1, expected1) + self.assert_index_equal(result2, expected2) class TestDateRange(tm.TestCase): @@ -140,7 +141,7 @@ def test_comparison(self): def test_copy(self): cp = self.rng.copy() repr(cp) - self.assertTrue(cp.equals(self.rng)) + self.assert_index_equal(cp, self.rng) def test_repr(self): # only really care that it works @@ -148,7 +149,9 @@ def test_repr(self): def test_getitem(self): smaller = self.rng[:5] - self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) + exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) + self.assert_index_equal(smaller, exp) + self.assertEqual(smaller.offset, self.rng.offset) sliced = self.rng[::5] @@ -211,7 +214,7 @@ def test_union(self): tm.assertIsInstance(the_union, DatetimeIndex) # order does not matter - self.assert_numpy_array_equal(right.union(left), the_union) + tm.assert_index_equal(right.union(left), the_union) # overlapping, but different offset rng = date_range(START, END, freq=datetools.bmonthEnd) @@ -256,13 +259,13 @@ def test_union_not_cacheable(self): rng1 = rng[10:] rng2 = rng[:25] the_union = rng1.union(rng2) - self.assertTrue(the_union.equals(rng)) + self.assert_index_equal(the_union, rng) rng1 = rng[10:] rng2 = rng[15:35] the_union = rng1.union(rng2) expected = rng[10:] - self.assertTrue(the_union.equals(expected)) + self.assert_index_equal(the_union, expected) def test_intersection(self): rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) @@ -270,24 +273,24 @@ def test_intersection(self): rng2 = rng[:25] the_int = rng1.intersection(rng2) expected = rng[10:25] - self.assertTrue(the_int.equals(expected)) + self.assert_index_equal(the_int, expected) tm.assertIsInstance(the_int, DatetimeIndex) self.assertEqual(the_int.offset, rng.offset) the_int = rng1.intersection(rng2.view(DatetimeIndex)) - self.assertTrue(the_int.equals(expected)) + self.assert_index_equal(the_int, expected) # non-overlapping the_int = rng[:10].intersection(rng[10:]) expected = DatetimeIndex([]) - self.assertTrue(the_int.equals(expected)) + self.assert_index_equal(the_int, expected) def test_intersection_bug(self): # GH #771 a = bdate_range('11/30/2011', '12/31/2011') b = bdate_range('12/10/2011', '12/20/2011') result = a.intersection(b) - self.assertTrue(result.equals(b)) + self.assert_index_equal(result, b) def test_summary(self): self.rng.summary() @@ -364,7 +367,7 @@ def test_range_bug(self): start = datetime(2011, 1, 1) exp_values = [start + i * offset for i in range(5)] - self.assert_numpy_array_equal(result, DatetimeIndex(exp_values)) + tm.assert_index_equal(result, DatetimeIndex(exp_values)) def test_range_tz_pytz(self): # GH 2906 @@ -494,8 +497,8 @@ def test_range_closed(self): if begin == closed[0]: expected_right = closed[1:] - self.assertTrue(expected_left.equals(left)) - self.assertTrue(expected_right.equals(right)) + self.assert_index_equal(expected_left, left) + self.assert_index_equal(expected_right, right) def test_range_closed_with_tz_aware_start_end(self): # GH12409 @@ -514,8 +517,8 @@ def test_range_closed_with_tz_aware_start_end(self): if begin == closed[0]: expected_right = closed[1:] - self.assertTrue(expected_left.equals(left)) - self.assertTrue(expected_right.equals(right)) + self.assert_index_equal(expected_left, left) + self.assert_index_equal(expected_right, right) # test with default frequency, UTC begin = Timestamp('2011/1/1', tz='UTC') @@ -546,9 +549,9 @@ def test_range_closed_boundary(self): 
expected_right = both_boundary[1:] expected_left = both_boundary[:-1] - self.assertTrue(right_boundary.equals(expected_right)) - self.assertTrue(left_boundary.equals(expected_left)) - self.assertTrue(both_boundary.equals(expected_both)) + self.assert_index_equal(right_boundary, expected_right) + self.assert_index_equal(left_boundary, expected_left) + self.assert_index_equal(both_boundary, expected_both) def test_years_only(self): # GH 6961 @@ -570,8 +573,8 @@ def test_freq_divides_end_in_nanos(self): '2005-01-13 15:45:00'], dtype='datetime64[ns]', freq='345T', tz=None) - self.assertTrue(result_1.equals(expected_1)) - self.assertTrue(result_2.equals(expected_2)) + self.assert_index_equal(result_1, expected_1) + self.assert_index_equal(result_2, expected_2) class TestCustomDateRange(tm.TestCase): @@ -613,7 +616,7 @@ def test_comparison(self): def test_copy(self): cp = self.rng.copy() repr(cp) - self.assertTrue(cp.equals(self.rng)) + self.assert_index_equal(cp, self.rng) def test_repr(self): # only really care that it works @@ -621,7 +624,8 @@ def test_repr(self): def test_getitem(self): smaller = self.rng[:5] - self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) + exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) + self.assert_index_equal(smaller, exp) self.assertEqual(smaller.offset, self.rng.offset) sliced = self.rng[::5] @@ -686,7 +690,7 @@ def test_union(self): tm.assertIsInstance(the_union, DatetimeIndex) # order does not matter - self.assert_numpy_array_equal(right.union(left), the_union) + self.assert_index_equal(right.union(left), the_union) # overlapping, but different offset rng = date_range(START, END, freq=datetools.bmonthEnd) @@ -731,7 +735,7 @@ def test_intersection_bug(self): a = cdate_range('11/30/2011', '12/31/2011') b = cdate_range('12/10/2011', '12/20/2011') result = a.intersection(b) - self.assertTrue(result.equals(b)) + self.assert_index_equal(result, b) def test_summary(self): self.rng.summary() @@ -783,25 +787,25 @@ def test_daterange_bug_456(self): def test_cdaterange(self): rng = cdate_range('2013-05-01', periods=3) xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03']) - self.assertTrue(xp.equals(rng)) + self.assert_index_equal(xp, rng) def test_cdaterange_weekmask(self): rng = cdate_range('2013-05-01', periods=3, weekmask='Sun Mon Tue Wed Thu') xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05']) - self.assertTrue(xp.equals(rng)) + self.assert_index_equal(xp, rng) def test_cdaterange_holidays(self): rng = cdate_range('2013-05-01', periods=3, holidays=['2013-05-01']) xp = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06']) - self.assertTrue(xp.equals(rng)) + self.assert_index_equal(xp, rng) def test_cdaterange_weekmask_and_holidays(self): rng = cdate_range('2013-05-01', periods=3, weekmask='Sun Mon Tue Wed Thu', holidays=['2013-05-01']) xp = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06']) - self.assertTrue(xp.equals(rng)) + self.assert_index_equal(xp, rng) if __name__ == '__main__': diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 0e91e396965fa..ec88acc421cdb 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -4551,7 +4551,7 @@ def test_all_offset_classes(self): for offset, test_values in iteritems(tests): first = Timestamp(test_values[0], tz='US/Eastern') + offset() second = Timestamp(test_values[1], tz='US/Eastern') - self.assertEqual(first, second, str(offset)) + self.assertEqual(first, second, msg=str(offset)) if 
__name__ == '__main__': diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 740a158c52f87..de23306c80b71 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -8,9 +8,7 @@ from datetime import datetime, date, timedelta -from numpy.ma.testutils import assert_equal - -from pandas import Timestamp +from pandas import Timestamp, _period from pandas.tseries.frequencies import MONTHS, DAYS, _period_code_map from pandas.tseries.period import Period, PeriodIndex, period_range from pandas.tseries.index import DatetimeIndex, date_range, Index @@ -28,8 +26,6 @@ from pandas import (Series, DataFrame, _np_version_under1p9, _np_version_under1p12) from pandas import tslib -from pandas.util.testing import (assert_index_equal, assert_series_equal, - assert_almost_equal, assertRaisesRegexp) import pandas.util.testing as tm @@ -492,8 +488,8 @@ def test_sub_delta(self): result = left - right self.assertEqual(result, 4) - self.assertRaises(ValueError, left.__sub__, - Period('2007-01', freq='M')) + with self.assertRaises(period.IncompatibleFrequency): + left - Period('2007-01', freq='M') def test_to_timestamp(self): p = Period('1982', freq='A') @@ -625,7 +621,7 @@ def _ex(*args): def test_properties_annually(self): # Test properties on Periods with annually frequency. a_date = Period(freq='A', year=2007) - assert_equal(a_date.year, 2007) + self.assertEqual(a_date.year, 2007) def test_properties_quarterly(self): # Test properties on Periods with daily frequency. @@ -635,78 +631,78 @@ def test_properties_quarterly(self): # for x in range(3): for qd in (qedec_date, qejan_date, qejun_date): - assert_equal((qd + x).qyear, 2007) - assert_equal((qd + x).quarter, x + 1) + self.assertEqual((qd + x).qyear, 2007) + self.assertEqual((qd + x).quarter, x + 1) def test_properties_monthly(self): # Test properties on Periods with daily frequency. m_date = Period(freq='M', year=2007, month=1) for x in range(11): m_ival_x = m_date + x - assert_equal(m_ival_x.year, 2007) + self.assertEqual(m_ival_x.year, 2007) if 1 <= x + 1 <= 3: - assert_equal(m_ival_x.quarter, 1) + self.assertEqual(m_ival_x.quarter, 1) elif 4 <= x + 1 <= 6: - assert_equal(m_ival_x.quarter, 2) + self.assertEqual(m_ival_x.quarter, 2) elif 7 <= x + 1 <= 9: - assert_equal(m_ival_x.quarter, 3) + self.assertEqual(m_ival_x.quarter, 3) elif 10 <= x + 1 <= 12: - assert_equal(m_ival_x.quarter, 4) - assert_equal(m_ival_x.month, x + 1) + self.assertEqual(m_ival_x.quarter, 4) + self.assertEqual(m_ival_x.month, x + 1) def test_properties_weekly(self): # Test properties on Periods with daily frequency. w_date = Period(freq='W', year=2007, month=1, day=7) # - assert_equal(w_date.year, 2007) - assert_equal(w_date.quarter, 1) - assert_equal(w_date.month, 1) - assert_equal(w_date.week, 1) - assert_equal((w_date - 1).week, 52) - assert_equal(w_date.days_in_month, 31) - assert_equal(Period(freq='W', year=2012, - month=2, day=1).days_in_month, 29) + self.assertEqual(w_date.year, 2007) + self.assertEqual(w_date.quarter, 1) + self.assertEqual(w_date.month, 1) + self.assertEqual(w_date.week, 1) + self.assertEqual((w_date - 1).week, 52) + self.assertEqual(w_date.days_in_month, 31) + self.assertEqual(Period(freq='W', year=2012, + month=2, day=1).days_in_month, 29) def test_properties_weekly_legacy(self): # Test properties on Periods with daily frequency. 
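# Aside, not part of the patch: the test_period.py hunks drop
# numpy.ma.testutils.assert_equal in favour of unittest's own
# assertEqual, and check the specific IncompatibleFrequency error via
# the context-manager form. A stand-in TestCase sketch (assumes the
# 0.18-era layout where IncompatibleFrequency is importable from
# pandas.tseries.period):
import unittest

import pandas as pd
from pandas.tseries import period


class PeriodAssertions(unittest.TestCase):
    def test_idioms(self):
        # stdlib assertion instead of numpy.ma.testutils.assert_equal
        self.assertEqual(pd.Period(freq='A', year=2007).year, 2007)
        # the context-manager form pins the exact exception type raised
        # on a frequency mismatch, not just any ValueError
        with self.assertRaises(period.IncompatibleFrequency):
            pd.Period('2011', freq='A') - pd.Period('2007-01', freq='M')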
with tm.assert_produces_warning(FutureWarning): w_date = Period(freq='WK', year=2007, month=1, day=7) # - assert_equal(w_date.year, 2007) - assert_equal(w_date.quarter, 1) - assert_equal(w_date.month, 1) - assert_equal(w_date.week, 1) - assert_equal((w_date - 1).week, 52) - assert_equal(w_date.days_in_month, 31) + self.assertEqual(w_date.year, 2007) + self.assertEqual(w_date.quarter, 1) + self.assertEqual(w_date.month, 1) + self.assertEqual(w_date.week, 1) + self.assertEqual((w_date - 1).week, 52) + self.assertEqual(w_date.days_in_month, 31) with tm.assert_produces_warning(FutureWarning): exp = Period(freq='WK', year=2012, month=2, day=1) - assert_equal(exp.days_in_month, 29) + self.assertEqual(exp.days_in_month, 29) def test_properties_daily(self): # Test properties on Periods with daily frequency. b_date = Period(freq='B', year=2007, month=1, day=1) # - assert_equal(b_date.year, 2007) - assert_equal(b_date.quarter, 1) - assert_equal(b_date.month, 1) - assert_equal(b_date.day, 1) - assert_equal(b_date.weekday, 0) - assert_equal(b_date.dayofyear, 1) - assert_equal(b_date.days_in_month, 31) - assert_equal(Period(freq='B', year=2012, - month=2, day=1).days_in_month, 29) + self.assertEqual(b_date.year, 2007) + self.assertEqual(b_date.quarter, 1) + self.assertEqual(b_date.month, 1) + self.assertEqual(b_date.day, 1) + self.assertEqual(b_date.weekday, 0) + self.assertEqual(b_date.dayofyear, 1) + self.assertEqual(b_date.days_in_month, 31) + self.assertEqual(Period(freq='B', year=2012, + month=2, day=1).days_in_month, 29) # d_date = Period(freq='D', year=2007, month=1, day=1) # - assert_equal(d_date.year, 2007) - assert_equal(d_date.quarter, 1) - assert_equal(d_date.month, 1) - assert_equal(d_date.day, 1) - assert_equal(d_date.weekday, 0) - assert_equal(d_date.dayofyear, 1) - assert_equal(d_date.days_in_month, 31) - assert_equal(Period(freq='D', year=2012, month=2, - day=1).days_in_month, 29) + self.assertEqual(d_date.year, 2007) + self.assertEqual(d_date.quarter, 1) + self.assertEqual(d_date.month, 1) + self.assertEqual(d_date.day, 1) + self.assertEqual(d_date.weekday, 0) + self.assertEqual(d_date.dayofyear, 1) + self.assertEqual(d_date.days_in_month, 31) + self.assertEqual(Period(freq='D', year=2012, month=2, + day=1).days_in_month, 29) def test_properties_hourly(self): # Test properties on Periods with hourly frequency. @@ -714,50 +710,50 @@ def test_properties_hourly(self): h_date2 = Period(freq='2H', year=2007, month=1, day=1, hour=0) for h_date in [h_date1, h_date2]: - assert_equal(h_date.year, 2007) - assert_equal(h_date.quarter, 1) - assert_equal(h_date.month, 1) - assert_equal(h_date.day, 1) - assert_equal(h_date.weekday, 0) - assert_equal(h_date.dayofyear, 1) - assert_equal(h_date.hour, 0) - assert_equal(h_date.days_in_month, 31) - assert_equal(Period(freq='H', year=2012, month=2, day=1, - hour=0).days_in_month, 29) + self.assertEqual(h_date.year, 2007) + self.assertEqual(h_date.quarter, 1) + self.assertEqual(h_date.month, 1) + self.assertEqual(h_date.day, 1) + self.assertEqual(h_date.weekday, 0) + self.assertEqual(h_date.dayofyear, 1) + self.assertEqual(h_date.hour, 0) + self.assertEqual(h_date.days_in_month, 31) + self.assertEqual(Period(freq='H', year=2012, month=2, day=1, + hour=0).days_in_month, 29) def test_properties_minutely(self): # Test properties on Periods with minutely frequency. 
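# Aside, not part of the patch: tm.assert_produces_warning, used by the
# *_legacy tests, fails when the block raises no warning or a warning of
# another category. Toy sketch (assumes a 0.18-era pandas where the
# deprecated 'WK' alias still resolves, with a FutureWarning):
import pandas as pd
import pandas.util.testing as tm

with tm.assert_produces_warning(FutureWarning):
    wk = pd.Period(freq='WK', year=2007, month=1, day=7)
assert wk.freqstr.startswith('W')  # alias resolved to the weekly freq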
t_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, minute=0) # - assert_equal(t_date.quarter, 1) - assert_equal(t_date.month, 1) - assert_equal(t_date.day, 1) - assert_equal(t_date.weekday, 0) - assert_equal(t_date.dayofyear, 1) - assert_equal(t_date.hour, 0) - assert_equal(t_date.minute, 0) - assert_equal(t_date.days_in_month, 31) - assert_equal(Period(freq='D', year=2012, month=2, day=1, hour=0, - minute=0).days_in_month, 29) + self.assertEqual(t_date.quarter, 1) + self.assertEqual(t_date.month, 1) + self.assertEqual(t_date.day, 1) + self.assertEqual(t_date.weekday, 0) + self.assertEqual(t_date.dayofyear, 1) + self.assertEqual(t_date.hour, 0) + self.assertEqual(t_date.minute, 0) + self.assertEqual(t_date.days_in_month, 31) + self.assertEqual(Period(freq='D', year=2012, month=2, day=1, hour=0, + minute=0).days_in_month, 29) def test_properties_secondly(self): # Test properties on Periods with secondly frequency. s_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, minute=0, second=0) # - assert_equal(s_date.year, 2007) - assert_equal(s_date.quarter, 1) - assert_equal(s_date.month, 1) - assert_equal(s_date.day, 1) - assert_equal(s_date.weekday, 0) - assert_equal(s_date.dayofyear, 1) - assert_equal(s_date.hour, 0) - assert_equal(s_date.minute, 0) - assert_equal(s_date.second, 0) - assert_equal(s_date.days_in_month, 31) - assert_equal(Period(freq='Min', year=2012, month=2, day=1, hour=0, - minute=0, second=0).days_in_month, 29) + self.assertEqual(s_date.year, 2007) + self.assertEqual(s_date.quarter, 1) + self.assertEqual(s_date.month, 1) + self.assertEqual(s_date.day, 1) + self.assertEqual(s_date.weekday, 0) + self.assertEqual(s_date.dayofyear, 1) + self.assertEqual(s_date.hour, 0) + self.assertEqual(s_date.minute, 0) + self.assertEqual(s_date.second, 0) + self.assertEqual(s_date.days_in_month, 31) + self.assertEqual(Period(freq='Min', year=2012, month=2, day=1, hour=0, + minute=0, second=0).days_in_month, 29) def test_properties_nat(self): p_nat = Period('NaT', freq='M') @@ -829,9 +825,13 @@ def test_asfreq_MS(self): self.assertEqual(initial.asfreq(freq="M", how="S"), Period('2013-01', 'M')) - self.assertRaises(ValueError, initial.asfreq, freq="MS", how="S") - tm.assertRaisesRegexp(ValueError, "Unknown freqstr: MS", pd.Period, - '2013-01', 'MS') + + with self.assertRaisesRegexp(ValueError, "Unknown freqstr"): + initial.asfreq(freq="MS", how="S") + + with tm.assertRaisesRegexp(ValueError, "Unknown freqstr: MS"): + pd.Period('2013-01', 'MS') + self.assertTrue(_period_code_map.get("MS") is None) @@ -890,35 +890,35 @@ def test_conv_annual(self): ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) - assert_equal(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) - assert_equal(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) - assert_equal(ival_A.asfreq('M', 's'), ival_A_to_M_start) - assert_equal(ival_A.asfreq('M', 'E'), ival_A_to_M_end) - assert_equal(ival_A.asfreq('W', 'S'), ival_A_to_W_start) - assert_equal(ival_A.asfreq('W', 'E'), ival_A_to_W_end) - assert_equal(ival_A.asfreq('B', 'S'), ival_A_to_B_start) - assert_equal(ival_A.asfreq('B', 'E'), ival_A_to_B_end) - assert_equal(ival_A.asfreq('D', 'S'), ival_A_to_D_start) - assert_equal(ival_A.asfreq('D', 'E'), ival_A_to_D_end) - assert_equal(ival_A.asfreq('H', 'S'), ival_A_to_H_start) - assert_equal(ival_A.asfreq('H', 'E'), ival_A_to_H_end) - assert_equal(ival_A.asfreq('min', 'S'), ival_A_to_T_start) - assert_equal(ival_A.asfreq('min', 'E'), 
ival_A_to_T_end) - assert_equal(ival_A.asfreq('T', 'S'), ival_A_to_T_start) - assert_equal(ival_A.asfreq('T', 'E'), ival_A_to_T_end) - assert_equal(ival_A.asfreq('S', 'S'), ival_A_to_S_start) - assert_equal(ival_A.asfreq('S', 'E'), ival_A_to_S_end) - - assert_equal(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) - assert_equal(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) - - assert_equal(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) - assert_equal(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) - - assert_equal(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) - assert_equal(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) - - assert_equal(ival_A.asfreq('A'), ival_A) + self.assertEqual(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) + self.assertEqual(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) + self.assertEqual(ival_A.asfreq('M', 's'), ival_A_to_M_start) + self.assertEqual(ival_A.asfreq('M', 'E'), ival_A_to_M_end) + self.assertEqual(ival_A.asfreq('W', 'S'), ival_A_to_W_start) + self.assertEqual(ival_A.asfreq('W', 'E'), ival_A_to_W_end) + self.assertEqual(ival_A.asfreq('B', 'S'), ival_A_to_B_start) + self.assertEqual(ival_A.asfreq('B', 'E'), ival_A_to_B_end) + self.assertEqual(ival_A.asfreq('D', 'S'), ival_A_to_D_start) + self.assertEqual(ival_A.asfreq('D', 'E'), ival_A_to_D_end) + self.assertEqual(ival_A.asfreq('H', 'S'), ival_A_to_H_start) + self.assertEqual(ival_A.asfreq('H', 'E'), ival_A_to_H_end) + self.assertEqual(ival_A.asfreq('min', 'S'), ival_A_to_T_start) + self.assertEqual(ival_A.asfreq('min', 'E'), ival_A_to_T_end) + self.assertEqual(ival_A.asfreq('T', 'S'), ival_A_to_T_start) + self.assertEqual(ival_A.asfreq('T', 'E'), ival_A_to_T_end) + self.assertEqual(ival_A.asfreq('S', 'S'), ival_A_to_S_start) + self.assertEqual(ival_A.asfreq('S', 'E'), ival_A_to_S_end) + + self.assertEqual(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) + self.assertEqual(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) + + self.assertEqual(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) + self.assertEqual(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) + + self.assertEqual(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) + self.assertEqual(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) + + self.assertEqual(ival_A.asfreq('A'), ival_A) def test_conv_quarterly(self): # frequency conversion tests: from Quarterly Frequency @@ -955,30 +955,30 @@ def test_conv_quarterly(self): ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) ival_QEJUN_to_D_end = Period(freq='D', year=2006, month=9, day=30) - assert_equal(ival_Q.asfreq('A'), ival_Q_to_A) - assert_equal(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) - - assert_equal(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) - assert_equal(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) - assert_equal(ival_Q.asfreq('W', 'S'), ival_Q_to_W_start) - assert_equal(ival_Q.asfreq('W', 'E'), ival_Q_to_W_end) - assert_equal(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) - assert_equal(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) - assert_equal(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) - assert_equal(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) - assert_equal(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) - assert_equal(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) - assert_equal(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) - assert_equal(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) - assert_equal(ival_Q.asfreq('S', 'S'), ival_Q_to_S_start) - assert_equal(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) - - assert_equal(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) - 
assert_equal(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) - assert_equal(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) - assert_equal(ival_QEJUN.asfreq('D', 'E'), ival_QEJUN_to_D_end) - - assert_equal(ival_Q.asfreq('Q'), ival_Q) + self.assertEqual(ival_Q.asfreq('A'), ival_Q_to_A) + self.assertEqual(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) + + self.assertEqual(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) + self.assertEqual(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) + self.assertEqual(ival_Q.asfreq('W', 'S'), ival_Q_to_W_start) + self.assertEqual(ival_Q.asfreq('W', 'E'), ival_Q_to_W_end) + self.assertEqual(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) + self.assertEqual(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) + self.assertEqual(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) + self.assertEqual(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) + self.assertEqual(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) + self.assertEqual(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) + self.assertEqual(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) + self.assertEqual(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) + self.assertEqual(ival_Q.asfreq('S', 'S'), ival_Q_to_S_start) + self.assertEqual(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) + + self.assertEqual(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) + self.assertEqual(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) + self.assertEqual(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) + self.assertEqual(ival_QEJUN.asfreq('D', 'E'), ival_QEJUN_to_D_end) + + self.assertEqual(ival_Q.asfreq('Q'), ival_Q) def test_conv_monthly(self): # frequency conversion tests: from Monthly Frequency @@ -1005,25 +1005,25 @@ def test_conv_monthly(self): ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, hour=23, minute=59, second=59) - assert_equal(ival_M.asfreq('A'), ival_M_to_A) - assert_equal(ival_M_end_of_year.asfreq('A'), ival_M_to_A) - assert_equal(ival_M.asfreq('Q'), ival_M_to_Q) - assert_equal(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) - - assert_equal(ival_M.asfreq('W', 'S'), ival_M_to_W_start) - assert_equal(ival_M.asfreq('W', 'E'), ival_M_to_W_end) - assert_equal(ival_M.asfreq('B', 'S'), ival_M_to_B_start) - assert_equal(ival_M.asfreq('B', 'E'), ival_M_to_B_end) - assert_equal(ival_M.asfreq('D', 'S'), ival_M_to_D_start) - assert_equal(ival_M.asfreq('D', 'E'), ival_M_to_D_end) - assert_equal(ival_M.asfreq('H', 'S'), ival_M_to_H_start) - assert_equal(ival_M.asfreq('H', 'E'), ival_M_to_H_end) - assert_equal(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) - assert_equal(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) - assert_equal(ival_M.asfreq('S', 'S'), ival_M_to_S_start) - assert_equal(ival_M.asfreq('S', 'E'), ival_M_to_S_end) - - assert_equal(ival_M.asfreq('M'), ival_M) + self.assertEqual(ival_M.asfreq('A'), ival_M_to_A) + self.assertEqual(ival_M_end_of_year.asfreq('A'), ival_M_to_A) + self.assertEqual(ival_M.asfreq('Q'), ival_M_to_Q) + self.assertEqual(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) + + self.assertEqual(ival_M.asfreq('W', 'S'), ival_M_to_W_start) + self.assertEqual(ival_M.asfreq('W', 'E'), ival_M_to_W_end) + self.assertEqual(ival_M.asfreq('B', 'S'), ival_M_to_B_start) + self.assertEqual(ival_M.asfreq('B', 'E'), ival_M_to_B_end) + self.assertEqual(ival_M.asfreq('D', 'S'), ival_M_to_D_start) + self.assertEqual(ival_M.asfreq('D', 'E'), ival_M_to_D_end) + self.assertEqual(ival_M.asfreq('H', 'S'), ival_M_to_H_start) + self.assertEqual(ival_M.asfreq('H', 'E'), ival_M_to_H_end) + self.assertEqual(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) 
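# Aside, not part of the patch: the how flag exercised by all of these
# asfreq assertions -- 'S' anchors the converted period at the first
# sub-period of the span, 'E' (the default) at the last, and the flag
# is case-insensitive. A small self-checking sketch:
import pandas as pd

p = pd.Period(freq='A', year=2007)
assert p.asfreq('M', how='S') == pd.Period('2007-01', freq='M')
assert p.asfreq('M', how='E') == pd.Period('2007-12', freq='M')
assert p.asfreq('A') == p  # converting to its own freq is the identity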
+ self.assertEqual(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) + self.assertEqual(ival_M.asfreq('S', 'S'), ival_M_to_S_start) + self.assertEqual(ival_M.asfreq('S', 'E'), ival_M_to_S_end) + + self.assertEqual(ival_M.asfreq('M'), ival_M) def test_conv_weekly(self): # frequency conversion tests: from Weekly Frequency @@ -1089,43 +1089,45 @@ def test_conv_weekly(self): ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, minute=59, second=59) - assert_equal(ival_W.asfreq('A'), ival_W_to_A) - assert_equal(ival_W_end_of_year.asfreq('A'), ival_W_to_A_end_of_year) - assert_equal(ival_W.asfreq('Q'), ival_W_to_Q) - assert_equal(ival_W_end_of_quarter.asfreq('Q'), - ival_W_to_Q_end_of_quarter) - assert_equal(ival_W.asfreq('M'), ival_W_to_M) - assert_equal(ival_W_end_of_month.asfreq('M'), ival_W_to_M_end_of_month) - - assert_equal(ival_W.asfreq('B', 'S'), ival_W_to_B_start) - assert_equal(ival_W.asfreq('B', 'E'), ival_W_to_B_end) - - assert_equal(ival_W.asfreq('D', 'S'), ival_W_to_D_start) - assert_equal(ival_W.asfreq('D', 'E'), ival_W_to_D_end) - - assert_equal(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) - assert_equal(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) - assert_equal(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) - assert_equal(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) - assert_equal(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) - assert_equal(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) - assert_equal(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) - assert_equal(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) - assert_equal(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) - assert_equal(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) - assert_equal(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) - assert_equal(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) - assert_equal(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) - assert_equal(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) - - assert_equal(ival_W.asfreq('H', 'S'), ival_W_to_H_start) - assert_equal(ival_W.asfreq('H', 'E'), ival_W_to_H_end) - assert_equal(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) - assert_equal(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) - assert_equal(ival_W.asfreq('S', 'S'), ival_W_to_S_start) - assert_equal(ival_W.asfreq('S', 'E'), ival_W_to_S_end) - - assert_equal(ival_W.asfreq('W'), ival_W) + self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) + self.assertEqual(ival_W_end_of_year.asfreq('A'), + ival_W_to_A_end_of_year) + self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) + self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), + ival_W_to_Q_end_of_quarter) + self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) + self.assertEqual(ival_W_end_of_month.asfreq('M'), + ival_W_to_M_end_of_month) + + self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) + self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) + + self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) + self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) + + self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) + self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) + self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) + self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) + self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) + self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) + self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) + self.assertEqual(ival_WTHU.asfreq('D', 'E'), 
ival_WTHU_to_D_end) + self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) + self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) + self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) + self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) + self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) + self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) + + self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) + self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) + self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) + self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) + self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) + self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) + + self.assertEqual(ival_W.asfreq('W'), ival_W) def test_conv_weekly_legacy(self): # frequency conversion tests: from Weekly Frequency @@ -1204,44 +1206,46 @@ def test_conv_weekly_legacy(self): ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, minute=59, second=59) - assert_equal(ival_W.asfreq('A'), ival_W_to_A) - assert_equal(ival_W_end_of_year.asfreq('A'), ival_W_to_A_end_of_year) - assert_equal(ival_W.asfreq('Q'), ival_W_to_Q) - assert_equal(ival_W_end_of_quarter.asfreq('Q'), - ival_W_to_Q_end_of_quarter) - assert_equal(ival_W.asfreq('M'), ival_W_to_M) - assert_equal(ival_W_end_of_month.asfreq('M'), ival_W_to_M_end_of_month) - - assert_equal(ival_W.asfreq('B', 'S'), ival_W_to_B_start) - assert_equal(ival_W.asfreq('B', 'E'), ival_W_to_B_end) - - assert_equal(ival_W.asfreq('D', 'S'), ival_W_to_D_start) - assert_equal(ival_W.asfreq('D', 'E'), ival_W_to_D_end) - - assert_equal(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) - assert_equal(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) - assert_equal(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) - assert_equal(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) - assert_equal(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) - assert_equal(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) - assert_equal(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) - assert_equal(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) - assert_equal(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) - assert_equal(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) - assert_equal(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) - assert_equal(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) - assert_equal(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) - assert_equal(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) - - assert_equal(ival_W.asfreq('H', 'S'), ival_W_to_H_start) - assert_equal(ival_W.asfreq('H', 'E'), ival_W_to_H_end) - assert_equal(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) - assert_equal(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) - assert_equal(ival_W.asfreq('S', 'S'), ival_W_to_S_start) - assert_equal(ival_W.asfreq('S', 'E'), ival_W_to_S_end) + self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) + self.assertEqual(ival_W_end_of_year.asfreq('A'), + ival_W_to_A_end_of_year) + self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) + self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), + ival_W_to_Q_end_of_quarter) + self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) + self.assertEqual(ival_W_end_of_month.asfreq('M'), + ival_W_to_M_end_of_month) + + self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) + self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) + + self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) + 
self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) + + self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) + self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) + self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) + self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) + self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) + self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) + self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) + self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) + self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) + self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) + self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) + self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) + self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) + self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) + + self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) + self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) + self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) + self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) + self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) + self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) with tm.assert_produces_warning(FutureWarning): - assert_equal(ival_W.asfreq('WK'), ival_W) + self.assertEqual(ival_W.asfreq('WK'), ival_W) def test_conv_business(self): # frequency conversion tests: from Business Frequency" @@ -1268,25 +1272,25 @@ def test_conv_business(self): ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, minute=59, second=59) - assert_equal(ival_B.asfreq('A'), ival_B_to_A) - assert_equal(ival_B_end_of_year.asfreq('A'), ival_B_to_A) - assert_equal(ival_B.asfreq('Q'), ival_B_to_Q) - assert_equal(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) - assert_equal(ival_B.asfreq('M'), ival_B_to_M) - assert_equal(ival_B_end_of_month.asfreq('M'), ival_B_to_M) - assert_equal(ival_B.asfreq('W'), ival_B_to_W) - assert_equal(ival_B_end_of_week.asfreq('W'), ival_B_to_W) + self.assertEqual(ival_B.asfreq('A'), ival_B_to_A) + self.assertEqual(ival_B_end_of_year.asfreq('A'), ival_B_to_A) + self.assertEqual(ival_B.asfreq('Q'), ival_B_to_Q) + self.assertEqual(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) + self.assertEqual(ival_B.asfreq('M'), ival_B_to_M) + self.assertEqual(ival_B_end_of_month.asfreq('M'), ival_B_to_M) + self.assertEqual(ival_B.asfreq('W'), ival_B_to_W) + self.assertEqual(ival_B_end_of_week.asfreq('W'), ival_B_to_W) - assert_equal(ival_B.asfreq('D'), ival_B_to_D) + self.assertEqual(ival_B.asfreq('D'), ival_B_to_D) - assert_equal(ival_B.asfreq('H', 'S'), ival_B_to_H_start) - assert_equal(ival_B.asfreq('H', 'E'), ival_B_to_H_end) - assert_equal(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) - assert_equal(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) - assert_equal(ival_B.asfreq('S', 'S'), ival_B_to_S_start) - assert_equal(ival_B.asfreq('S', 'E'), ival_B_to_S_end) + self.assertEqual(ival_B.asfreq('H', 'S'), ival_B_to_H_start) + self.assertEqual(ival_B.asfreq('H', 'E'), ival_B_to_H_end) + self.assertEqual(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) + self.assertEqual(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) + self.assertEqual(ival_B.asfreq('S', 'S'), ival_B_to_S_start) + self.assertEqual(ival_B.asfreq('S', 'E'), ival_B_to_S_end) - assert_equal(ival_B.asfreq('B'), ival_B) + 
self.assertEqual(ival_B.asfreq('B'), ival_B) def test_conv_daily(self): # frequency conversion tests: from Business Frequency" @@ -1331,36 +1335,39 @@ def test_conv_daily(self): ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, minute=59, second=59) - assert_equal(ival_D.asfreq('A'), ival_D_to_A) - - assert_equal(ival_D_end_of_quarter.asfreq('A-JAN'), ival_Deoq_to_AJAN) - assert_equal(ival_D_end_of_quarter.asfreq('A-JUN'), ival_Deoq_to_AJUN) - assert_equal(ival_D_end_of_quarter.asfreq('A-DEC'), ival_Deoq_to_ADEC) - - assert_equal(ival_D_end_of_year.asfreq('A'), ival_D_to_A) - assert_equal(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) - assert_equal(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) - assert_equal(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) - assert_equal(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) - assert_equal(ival_D.asfreq('M'), ival_D_to_M) - assert_equal(ival_D_end_of_month.asfreq('M'), ival_D_to_M) - assert_equal(ival_D.asfreq('W'), ival_D_to_W) - assert_equal(ival_D_end_of_week.asfreq('W'), ival_D_to_W) - - assert_equal(ival_D_friday.asfreq('B'), ival_B_friday) - assert_equal(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) - assert_equal(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) - assert_equal(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) - assert_equal(ival_D_sunday.asfreq('B', 'E'), ival_B_monday) - - assert_equal(ival_D.asfreq('H', 'S'), ival_D_to_H_start) - assert_equal(ival_D.asfreq('H', 'E'), ival_D_to_H_end) - assert_equal(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) - assert_equal(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) - assert_equal(ival_D.asfreq('S', 'S'), ival_D_to_S_start) - assert_equal(ival_D.asfreq('S', 'E'), ival_D_to_S_end) - - assert_equal(ival_D.asfreq('D'), ival_D) + self.assertEqual(ival_D.asfreq('A'), ival_D_to_A) + + self.assertEqual(ival_D_end_of_quarter.asfreq('A-JAN'), + ival_Deoq_to_AJAN) + self.assertEqual(ival_D_end_of_quarter.asfreq('A-JUN'), + ival_Deoq_to_AJUN) + self.assertEqual(ival_D_end_of_quarter.asfreq('A-DEC'), + ival_Deoq_to_ADEC) + + self.assertEqual(ival_D_end_of_year.asfreq('A'), ival_D_to_A) + self.assertEqual(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) + self.assertEqual(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) + self.assertEqual(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) + self.assertEqual(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) + self.assertEqual(ival_D.asfreq('M'), ival_D_to_M) + self.assertEqual(ival_D_end_of_month.asfreq('M'), ival_D_to_M) + self.assertEqual(ival_D.asfreq('W'), ival_D_to_W) + self.assertEqual(ival_D_end_of_week.asfreq('W'), ival_D_to_W) + + self.assertEqual(ival_D_friday.asfreq('B'), ival_B_friday) + self.assertEqual(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) + self.assertEqual(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) + self.assertEqual(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) + self.assertEqual(ival_D_sunday.asfreq('B', 'E'), ival_B_monday) + + self.assertEqual(ival_D.asfreq('H', 'S'), ival_D_to_H_start) + self.assertEqual(ival_D.asfreq('H', 'E'), ival_D_to_H_end) + self.assertEqual(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) + self.assertEqual(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) + self.assertEqual(ival_D.asfreq('S', 'S'), ival_D_to_S_start) + self.assertEqual(ival_D.asfreq('S', 'E'), ival_D_to_S_end) + + self.assertEqual(ival_D.asfreq('D'), ival_D) def test_conv_hourly(self): # frequency conversion tests: from Hourly Frequency" @@ -1395,25 +1402,25 @@ def test_conv_hourly(self): ival_H_to_S_end = Period(freq='S', year=2007, month=1, 
day=1, hour=0, minute=59, second=59) - assert_equal(ival_H.asfreq('A'), ival_H_to_A) - assert_equal(ival_H_end_of_year.asfreq('A'), ival_H_to_A) - assert_equal(ival_H.asfreq('Q'), ival_H_to_Q) - assert_equal(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) - assert_equal(ival_H.asfreq('M'), ival_H_to_M) - assert_equal(ival_H_end_of_month.asfreq('M'), ival_H_to_M) - assert_equal(ival_H.asfreq('W'), ival_H_to_W) - assert_equal(ival_H_end_of_week.asfreq('W'), ival_H_to_W) - assert_equal(ival_H.asfreq('D'), ival_H_to_D) - assert_equal(ival_H_end_of_day.asfreq('D'), ival_H_to_D) - assert_equal(ival_H.asfreq('B'), ival_H_to_B) - assert_equal(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) - - assert_equal(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) - assert_equal(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) - assert_equal(ival_H.asfreq('S', 'S'), ival_H_to_S_start) - assert_equal(ival_H.asfreq('S', 'E'), ival_H_to_S_end) - - assert_equal(ival_H.asfreq('H'), ival_H) + self.assertEqual(ival_H.asfreq('A'), ival_H_to_A) + self.assertEqual(ival_H_end_of_year.asfreq('A'), ival_H_to_A) + self.assertEqual(ival_H.asfreq('Q'), ival_H_to_Q) + self.assertEqual(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) + self.assertEqual(ival_H.asfreq('M'), ival_H_to_M) + self.assertEqual(ival_H_end_of_month.asfreq('M'), ival_H_to_M) + self.assertEqual(ival_H.asfreq('W'), ival_H_to_W) + self.assertEqual(ival_H_end_of_week.asfreq('W'), ival_H_to_W) + self.assertEqual(ival_H.asfreq('D'), ival_H_to_D) + self.assertEqual(ival_H_end_of_day.asfreq('D'), ival_H_to_D) + self.assertEqual(ival_H.asfreq('B'), ival_H_to_B) + self.assertEqual(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) + + self.assertEqual(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) + self.assertEqual(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) + self.assertEqual(ival_H.asfreq('S', 'S'), ival_H_to_S_start) + self.assertEqual(ival_H.asfreq('S', 'E'), ival_H_to_S_end) + + self.assertEqual(ival_H.asfreq('H'), ival_H) def test_conv_minutely(self): # frequency conversion tests: from Minutely Frequency" @@ -1448,25 +1455,25 @@ def test_conv_minutely(self): ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=0, second=59) - assert_equal(ival_T.asfreq('A'), ival_T_to_A) - assert_equal(ival_T_end_of_year.asfreq('A'), ival_T_to_A) - assert_equal(ival_T.asfreq('Q'), ival_T_to_Q) - assert_equal(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) - assert_equal(ival_T.asfreq('M'), ival_T_to_M) - assert_equal(ival_T_end_of_month.asfreq('M'), ival_T_to_M) - assert_equal(ival_T.asfreq('W'), ival_T_to_W) - assert_equal(ival_T_end_of_week.asfreq('W'), ival_T_to_W) - assert_equal(ival_T.asfreq('D'), ival_T_to_D) - assert_equal(ival_T_end_of_day.asfreq('D'), ival_T_to_D) - assert_equal(ival_T.asfreq('B'), ival_T_to_B) - assert_equal(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) - assert_equal(ival_T.asfreq('H'), ival_T_to_H) - assert_equal(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) - - assert_equal(ival_T.asfreq('S', 'S'), ival_T_to_S_start) - assert_equal(ival_T.asfreq('S', 'E'), ival_T_to_S_end) - - assert_equal(ival_T.asfreq('Min'), ival_T) + self.assertEqual(ival_T.asfreq('A'), ival_T_to_A) + self.assertEqual(ival_T_end_of_year.asfreq('A'), ival_T_to_A) + self.assertEqual(ival_T.asfreq('Q'), ival_T_to_Q) + self.assertEqual(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) + self.assertEqual(ival_T.asfreq('M'), ival_T_to_M) + self.assertEqual(ival_T_end_of_month.asfreq('M'), ival_T_to_M) + self.assertEqual(ival_T.asfreq('W'), ival_T_to_W) + 
self.assertEqual(ival_T_end_of_week.asfreq('W'), ival_T_to_W) + self.assertEqual(ival_T.asfreq('D'), ival_T_to_D) + self.assertEqual(ival_T_end_of_day.asfreq('D'), ival_T_to_D) + self.assertEqual(ival_T.asfreq('B'), ival_T_to_B) + self.assertEqual(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) + self.assertEqual(ival_T.asfreq('H'), ival_T_to_H) + self.assertEqual(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) + + self.assertEqual(ival_T.asfreq('S', 'S'), ival_T_to_S_start) + self.assertEqual(ival_T.asfreq('S', 'E'), ival_T_to_S_end) + + self.assertEqual(ival_T.asfreq('Min'), ival_T) def test_conv_secondly(self): # frequency conversion tests: from Secondly Frequency" @@ -1500,24 +1507,24 @@ def test_conv_secondly(self): ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, minute=0) - assert_equal(ival_S.asfreq('A'), ival_S_to_A) - assert_equal(ival_S_end_of_year.asfreq('A'), ival_S_to_A) - assert_equal(ival_S.asfreq('Q'), ival_S_to_Q) - assert_equal(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) - assert_equal(ival_S.asfreq('M'), ival_S_to_M) - assert_equal(ival_S_end_of_month.asfreq('M'), ival_S_to_M) - assert_equal(ival_S.asfreq('W'), ival_S_to_W) - assert_equal(ival_S_end_of_week.asfreq('W'), ival_S_to_W) - assert_equal(ival_S.asfreq('D'), ival_S_to_D) - assert_equal(ival_S_end_of_day.asfreq('D'), ival_S_to_D) - assert_equal(ival_S.asfreq('B'), ival_S_to_B) - assert_equal(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) - assert_equal(ival_S.asfreq('H'), ival_S_to_H) - assert_equal(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) - assert_equal(ival_S.asfreq('Min'), ival_S_to_T) - assert_equal(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) - - assert_equal(ival_S.asfreq('S'), ival_S) + self.assertEqual(ival_S.asfreq('A'), ival_S_to_A) + self.assertEqual(ival_S_end_of_year.asfreq('A'), ival_S_to_A) + self.assertEqual(ival_S.asfreq('Q'), ival_S_to_Q) + self.assertEqual(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) + self.assertEqual(ival_S.asfreq('M'), ival_S_to_M) + self.assertEqual(ival_S_end_of_month.asfreq('M'), ival_S_to_M) + self.assertEqual(ival_S.asfreq('W'), ival_S_to_W) + self.assertEqual(ival_S_end_of_week.asfreq('W'), ival_S_to_W) + self.assertEqual(ival_S.asfreq('D'), ival_S_to_D) + self.assertEqual(ival_S_end_of_day.asfreq('D'), ival_S_to_D) + self.assertEqual(ival_S.asfreq('B'), ival_S_to_B) + self.assertEqual(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) + self.assertEqual(ival_S.asfreq('H'), ival_S_to_H) + self.assertEqual(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) + self.assertEqual(ival_S.asfreq('Min'), ival_S_to_T) + self.assertEqual(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) + + self.assertEqual(ival_S.asfreq('S'), ival_S) def test_asfreq_nat(self): p = Period('NaT', freq='A') @@ -1627,18 +1634,12 @@ def test_make_time_series(self): series = Series(1, index=index) tm.assertIsInstance(series, Series) - def test_astype(self): - idx = period_range('1990', '2009', freq='A') - - result = idx.astype('i8') - self.assert_numpy_array_equal(result, idx.values) - def test_constructor_use_start_freq(self): # GH #1118 p = Period('4/2/2012', freq='B') index = PeriodIndex(start=p, periods=10) expected = PeriodIndex(start='4/2/2012', periods=10, freq='B') - self.assertTrue(index.equals(expected)) + tm.assert_index_equal(index, expected) def test_constructor_field_arrays(self): # GH #1264 @@ -1648,13 +1649,13 @@ def test_constructor_field_arrays(self): index = PeriodIndex(year=years, quarter=quarters, freq='Q-DEC') expected = period_range('1990Q3', '2009Q2', freq='Q-DEC') - 
self.assertTrue(index.equals(expected)) + tm.assert_index_equal(index, expected) index2 = PeriodIndex(year=years, quarter=quarters, freq='2Q-DEC') tm.assert_numpy_array_equal(index.asi8, index2.asi8) index = PeriodIndex(year=years, quarter=quarters) - self.assertTrue(index.equals(expected)) + tm.assert_index_equal(index, expected) years = [2007, 2007, 2007] months = [1, 2] @@ -1669,7 +1670,7 @@ def test_constructor_field_arrays(self): months = [1, 2, 3] idx = PeriodIndex(year=years, month=months, freq='M') exp = period_range('2007-01', periods=3, freq='M') - self.assertTrue(idx.equals(exp)) + tm.assert_index_equal(idx, exp) def test_constructor_U(self): # U was used as undefined period @@ -1700,7 +1701,7 @@ def test_constructor_corner(self): result = period_range('2007-01', periods=10.5, freq='M') exp = period_range('2007-01', periods=10, freq='M') - self.assertTrue(result.equals(exp)) + tm.assert_index_equal(result, exp) def test_constructor_fromarraylike(self): idx = period_range('2007-01', periods=20, freq='M') @@ -1711,29 +1712,29 @@ def test_constructor_fromarraylike(self): data=Period('2007', freq='A')) result = PeriodIndex(iter(idx)) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) result = PeriodIndex(idx) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) result = PeriodIndex(idx, freq='M') - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) result = PeriodIndex(idx, freq=offsets.MonthEnd()) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) self.assertTrue(result.freq, 'M') result = PeriodIndex(idx, freq='2M') - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx.asfreq('2M')) self.assertTrue(result.freq, '2M') result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx.asfreq('2M')) self.assertTrue(result.freq, '2M') result = PeriodIndex(idx, freq='D') exp = idx.asfreq('D', 'e') - self.assertTrue(result.equals(exp)) + tm.assert_index_equal(result, exp) def test_constructor_datetime64arr(self): vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) @@ -1742,12 +1743,43 @@ def test_constructor_datetime64arr(self): self.assertRaises(ValueError, PeriodIndex, vals, freq='D') def test_constructor_simple_new(self): - idx = period_range('2007-01', name='p', periods=20, freq='M') + idx = period_range('2007-01', name='p', periods=2, freq='M') result = idx._simple_new(idx, 'p', freq=idx.freq) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) + + result = idx._simple_new([pd.Period('2007-01', freq='M'), + pd.Period('2007-02', freq='M')], + 'p', freq=idx.freq) + self.assert_index_equal(result, idx) + + result = idx._simple_new(np.array([pd.Period('2007-01', freq='M'), + pd.Period('2007-02', freq='M')]), + 'p', freq=idx.freq) + self.assert_index_equal(result, idx) + + def test_constructor_simple_new_empty(self): + # GH13079 + idx = PeriodIndex([], freq='M', name='p') + result = idx._simple_new(idx, name='p', freq='M') + tm.assert_index_equal(result, idx) + + def test_constructor_simple_new_floats(self): + # GH13079 + for floats in [[1.1], np.array([1.1])]: + with self.assertRaises(TypeError): + pd.PeriodIndex._simple_new(floats, freq='M') + + def test_shallow_copy_empty(self): + + # GH13067 + idx = PeriodIndex([], freq='M') + result = 
idx._shallow_copy() + expected = idx + + tm.assert_index_equal(result, expected) def test_constructor_nat(self): self.assertRaises(ValueError, period_range, start='NaT', @@ -1769,14 +1801,14 @@ def test_constructor_freq_mult(self): for func in [PeriodIndex, period_range]: # must be the same, but for sure... pidx = func(start='2014-01', freq='2M', periods=4) - expected = PeriodIndex( - ['2014-01', '2014-03', '2014-05', '2014-07'], freq='M') + expected = PeriodIndex(['2014-01', '2014-03', + '2014-05', '2014-07'], freq='2M') tm.assert_index_equal(pidx, expected) pidx = func(start='2014-01-02', end='2014-01-15', freq='3D') expected = PeriodIndex(['2014-01-02', '2014-01-05', '2014-01-08', '2014-01-11', - '2014-01-14'], freq='D') + '2014-01-14'], freq='3D') tm.assert_index_equal(pidx, expected) pidx = func(end='2014-01-01 17:00', freq='4H', periods=3) @@ -1805,7 +1837,7 @@ def test_constructor_freq_mult_dti_compat(self): freqstr = str(mult) + freq pidx = PeriodIndex(start='2014-04-01', freq=freqstr, periods=10) expected = date_range(start='2014-04-01', freq=freqstr, - periods=10).to_period(freq) + periods=10).to_period(freqstr) tm.assert_index_equal(pidx, expected) def test_is_(self): @@ -1867,7 +1899,7 @@ def test_getitem_partial(self): exp = result result = ts[24:] - assert_series_equal(exp, result) + tm.assert_series_equal(exp, result) ts = ts[10:].append(ts[10:]) self.assertRaisesRegexp(KeyError, @@ -1883,7 +1915,7 @@ def test_getitem_datetime(self): dt4 = datetime(2012, 4, 20) rs = ts[dt1:dt4] - assert_series_equal(rs, ts) + tm.assert_series_equal(rs, ts) def test_slice_with_negative_step(self): ts = Series(np.arange(20), @@ -1891,9 +1923,9 @@ def test_slice_with_negative_step(self): SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): - assert_series_equal(ts[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) assert_slices_equivalent(SLC[Period('2014-10')::-1], SLC[9::-1]) assert_slices_equivalent(SLC['2014-10'::-1], SLC[9::-1]) @@ -1933,11 +1965,11 @@ def test_sub(self): result = rng - 5 exp = rng + (-5) - self.assertTrue(result.equals(exp)) + tm.assert_index_equal(result, exp) def test_periods_number_check(self): - self.assertRaises(ValueError, period_range, '2011-1-1', '2012-1-1', - 'B') + with tm.assertRaises(ValueError): + period_range('2011-1-1', '2012-1-1', 'B') def test_tolist(self): index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -1945,7 +1977,7 @@ def test_tolist(self): [tm.assertIsInstance(x, Period) for x in rs] recon = PeriodIndex(rs) - self.assertTrue(index.equals(recon)) + tm.assert_index_equal(index, recon) def test_to_timestamp(self): index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -1953,12 +1985,12 @@ def test_to_timestamp(self): exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') result = series.to_timestamp(how='end') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) self.assertEqual(result.name, 'foo') exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') result = series.to_timestamp(how='start') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) def _get_with_delta(delta, freq='A-DEC'): return date_range(to_datetime('1/1/2001') + delta, @@ -1967,17 +1999,17 @@ 
def _get_with_delta(delta, freq='A-DEC'): delta = timedelta(hours=23) result = series.to_timestamp('H', 'end') exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23, minutes=59) result = series.to_timestamp('T', 'end') exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) result = series.to_timestamp('S', 'end') delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) index = PeriodIndex(freq='H', start='1/1/2001', end='1/2/2001') series = Series(1, index=index, name='foo') @@ -1985,7 +2017,7 @@ def _get_with_delta(delta, freq='A-DEC'): exp_index = date_range('1/1/2001 00:59:59', end='1/2/2001 00:59:59', freq='H') result = series.to_timestamp(how='end') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) self.assertEqual(result.name, 'foo') def test_to_timestamp_quarterly_bug(self): @@ -1996,7 +2028,7 @@ def test_to_timestamp_quarterly_bug(self): stamps = pindex.to_timestamp('D', 'end') expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) - self.assertTrue(stamps.equals(expected)) + tm.assert_index_equal(stamps, expected) def test_to_timestamp_preserve_name(self): index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009', @@ -2022,11 +2054,11 @@ def test_to_timestamp_pi_nat(self): result = index.to_timestamp('D') expected = DatetimeIndex([pd.NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name='idx') - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) self.assertEqual(result.name, 'idx') result2 = result.to_period(freq='M') - self.assertTrue(result2.equals(index)) + tm.assert_index_equal(result2, index) self.assertEqual(result2.name, 'idx') result3 = result.to_period(freq='3M') @@ -2053,25 +2085,25 @@ def test_to_timestamp_pi_mult(self): def test_start_time(self): index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') - self.assertTrue(index.start_time.equals(expected_index)) + tm.assert_index_equal(index.start_time, expected_index) def test_end_time(self): index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') - self.assertTrue(index.end_time.equals(expected_index)) + tm.assert_index_equal(index.end_time, expected_index) def test_as_frame_columns(self): rng = period_range('1/1/2000', periods=5) df = DataFrame(randn(10, 5), columns=rng) ts = df[rng[0]] - assert_series_equal(ts, df.ix[:, 0]) + tm.assert_series_equal(ts, df.ix[:, 0]) # GH # 1211 repr(df) ts = df['1/1/2000'] - assert_series_equal(ts, df.ix[:, 0]) + tm.assert_series_equal(ts, df.ix[:, 0]) def test_indexing(self): @@ -2083,17 +2115,18 @@ def test_indexing(self): self.assertEqual(expected, result) def test_frame_setitem(self): - rng = period_range('1/1/2000', periods=5) - rng.name = 'index' + rng = period_range('1/1/2000', periods=5, name='index') df = DataFrame(randn(5, 3), index=rng) df['Index'] = rng rs = Index(df['Index']) - self.assertTrue(rs.equals(rng)) + tm.assert_index_equal(rs, rng, check_names=False) + self.assertEqual(rs.name, 'Index') + self.assertEqual(rng.name, 'index') rs = df.reset_index().set_index('index') 
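# Aside, not part of the patch: a toy sketch of the name semantics that
# the reworked test_frame_setitem assertions above pin down --
# check_names=False ignores the deliberately different names, which are
# then asserted one by one. Variable names here are illustrative only.
import pandas as pd
import pandas.util.testing as tm

rng2 = pd.period_range('1/1/2000', periods=5, name='index')
frame = pd.DataFrame(index=rng2)
frame['Index'] = rng2  # column name differs from the index name
rs2 = pd.Index(frame['Index'])
tm.assert_index_equal(rs2, rng2, check_names=False)
assert rs2.name == 'Index' and rng2.name == 'index'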
tm.assertIsInstance(rs.index, PeriodIndex) - self.assertTrue(rs.index.equals(rng)) + tm.assert_index_equal(rs.index, rng) def test_period_set_index_reindex(self): # GH 6631 @@ -2102,9 +2135,9 @@ def test_period_set_index_reindex(self): idx2 = period_range('2013', periods=6, freq='A') df = df.set_index(idx1) - self.assertTrue(df.index.equals(idx1)) + tm.assert_index_equal(df.index, idx1) df = df.set_index(idx2) - self.assertTrue(df.index.equals(idx2)) + tm.assert_index_equal(df.index, idx2) def test_frame_to_time_stamp(self): K = 5 @@ -2114,12 +2147,12 @@ def test_frame_to_time_stamp(self): exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') result = df.to_timestamp('D', 'end') - self.assertTrue(result.index.equals(exp_index)) - assert_almost_equal(result.values, df.values) + tm.assert_index_equal(result.index, exp_index) + tm.assert_numpy_array_equal(result.values, df.values) exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') result = df.to_timestamp('D', 'start') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) def _get_with_delta(delta, freq='A-DEC'): return date_range(to_datetime('1/1/2001') + delta, @@ -2128,47 +2161,47 @@ def _get_with_delta(delta, freq='A-DEC'): delta = timedelta(hours=23) result = df.to_timestamp('H', 'end') exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23, minutes=59) result = df.to_timestamp('T', 'end') exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) result = df.to_timestamp('S', 'end') delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) # columns df = df.T exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') result = df.to_timestamp('D', 'end', axis=1) - self.assertTrue(result.columns.equals(exp_index)) - assert_almost_equal(result.values, df.values) + tm.assert_index_equal(result.columns, exp_index) + tm.assert_numpy_array_equal(result.values, df.values) exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') result = df.to_timestamp('D', 'start', axis=1) - self.assertTrue(result.columns.equals(exp_index)) + tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23) result = df.to_timestamp('H', 'end', axis=1) exp_index = _get_with_delta(delta) - self.assertTrue(result.columns.equals(exp_index)) + tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23, minutes=59) result = df.to_timestamp('T', 'end', axis=1) exp_index = _get_with_delta(delta) - self.assertTrue(result.columns.equals(exp_index)) + tm.assert_index_equal(result.columns, exp_index) result = df.to_timestamp('S', 'end', axis=1) delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - self.assertTrue(result.columns.equals(exp_index)) + tm.assert_index_equal(result.columns, exp_index) # invalid axis - assertRaisesRegexp(ValueError, 'axis', df.to_timestamp, axis=2) + tm.assertRaisesRegexp(ValueError, 'axis', df.to_timestamp, axis=2) result1 = df.to_timestamp('5t', axis=1) result2 = df.to_timestamp('t', axis=1) @@ -2188,7 +2221,7 @@ def test_index_duplicate_periods(self): result = ts[2007] expected = ts[1:3] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result[:] = 1 
self.assertTrue((ts[1:3] == 1).all()) @@ -2198,69 +2231,69 @@ def test_index_duplicate_periods(self): result = ts[2007] expected = ts[idx == 2007] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_index_unique(self): idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN') - self.assert_numpy_array_equal(idx.unique(), expected.values) + self.assert_index_equal(idx.unique(), expected) self.assertEqual(idx.nunique(), 3) idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN', tz='US/Eastern') expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN', tz='US/Eastern') - self.assert_numpy_array_equal(idx.unique(), expected.values) + self.assert_index_equal(idx.unique(), expected) self.assertEqual(idx.nunique(), 3) def test_constructor(self): pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - assert_equal(len(pi), 9) + self.assertEqual(len(pi), 9) pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') - assert_equal(len(pi), 4 * 9) + self.assertEqual(len(pi), 4 * 9) pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - assert_equal(len(pi), 12 * 9) + self.assertEqual(len(pi), 12 * 9) pi = PeriodIndex(freq='D', start='1/1/2001', end='12/31/2009') - assert_equal(len(pi), 365 * 9 + 2) + self.assertEqual(len(pi), 365 * 9 + 2) pi = PeriodIndex(freq='B', start='1/1/2001', end='12/31/2009') - assert_equal(len(pi), 261 * 9) + self.assertEqual(len(pi), 261 * 9) pi = PeriodIndex(freq='H', start='1/1/2001', end='12/31/2001 23:00') - assert_equal(len(pi), 365 * 24) + self.assertEqual(len(pi), 365 * 24) pi = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 23:59') - assert_equal(len(pi), 24 * 60) + self.assertEqual(len(pi), 24 * 60) pi = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') - assert_equal(len(pi), 24 * 60 * 60) + self.assertEqual(len(pi), 24 * 60 * 60) start = Period('02-Apr-2005', 'B') i1 = PeriodIndex(start=start, periods=20) - assert_equal(len(i1), 20) - assert_equal(i1.freq, start.freq) - assert_equal(i1[0], start) + self.assertEqual(len(i1), 20) + self.assertEqual(i1.freq, start.freq) + self.assertEqual(i1[0], start) end_intv = Period('2006-12-31', 'W') i1 = PeriodIndex(end=end_intv, periods=10) - assert_equal(len(i1), 10) - assert_equal(i1.freq, end_intv.freq) - assert_equal(i1[-1], end_intv) + self.assertEqual(len(i1), 10) + self.assertEqual(i1.freq, end_intv.freq) + self.assertEqual(i1[-1], end_intv) end_intv = Period('2006-12-31', '1w') i2 = PeriodIndex(end=end_intv, periods=10) - assert_equal(len(i1), len(i2)) + self.assertEqual(len(i1), len(i2)) self.assertTrue((i1 == i2).all()) - assert_equal(i1.freq, i2.freq) + self.assertEqual(i1.freq, i2.freq) end_intv = Period('2006-12-31', ('w', 1)) i2 = PeriodIndex(end=end_intv, periods=10) - assert_equal(len(i1), len(i2)) + self.assertEqual(len(i1), len(i2)) self.assertTrue((i1 == i2).all()) - assert_equal(i1.freq, i2.freq) + self.assertEqual(i1.freq, i2.freq) try: PeriodIndex(start=start, end=end_intv) @@ -2280,12 +2313,12 @@ def test_constructor(self): # infer freq from first element i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) - assert_equal(len(i2), 2) - assert_equal(i2[0], end_intv) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) - assert_equal(len(i2), 2) - assert_equal(i2[0], end_intv) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) # Mixed freq should fail vals 
= [end_intv, Period('2006-12-31', 'w')] @@ -2300,78 +2333,75 @@ def test_repeat(self): Period('2001-01-02'), Period('2001-01-02'), ]) - assert_index_equal(index.repeat(2), expected) + tm.assert_index_equal(index.repeat(2), expected) def test_numpy_repeat(self): index = period_range('20010101', periods=2) - expected = PeriodIndex([ - Period('2001-01-01'), Period('2001-01-01'), - Period('2001-01-02'), Period('2001-01-02'), - ]) + expected = PeriodIndex([Period('2001-01-01'), Period('2001-01-01'), + Period('2001-01-02'), Period('2001-01-02')]) - assert_index_equal(np.repeat(index, 2), expected) + tm.assert_index_equal(np.repeat(index, 2), expected) msg = "the 'axis' parameter is not supported" - assertRaisesRegexp(ValueError, msg, np.repeat, - index, 2, axis=1) + tm.assertRaisesRegexp(ValueError, msg, np.repeat, index, 2, axis=1) def test_shift(self): pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') - self.assertTrue(pi1.shift(0).equals(pi1)) + tm.assert_index_equal(pi1.shift(0), pi1) - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(1), pi2) pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(-1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(-1), pi2) pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(1), pi2) pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(-1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(-1), pi2) pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(1), pi2) pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(-1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(-1), pi2) def test_shift_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') result = idx.shift(1) - expected = PeriodIndex( - ['2011-02', '2011-03', 'NaT', '2011-05'], freq='M', name='idx') - self.assertTrue(result.equals(expected)) + expected = PeriodIndex(['2011-02', '2011-03', 'NaT', + '2011-05'], freq='M', name='idx') + tm.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) def test_shift_ndarray(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') result = idx.shift(np.array([1, 2, 3, 4])) - expected = PeriodIndex( - ['2011-02', '2011-04', 'NaT', '2011-08'], freq='M', name='idx') - self.assertTrue(result.equals(expected)) + expected = PeriodIndex(['2011-02', '2011-04', 'NaT', + '2011-08'], freq='M', name='idx') + 

        idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
                           '2011-04'], freq='M', name='idx')
        result = idx.shift(np.array([1, -2, 3, -4]))
-        expected = PeriodIndex(
-            ['2011-02', '2010-12', 'NaT', '2010-12'], freq='M', name='idx')
-        self.assertTrue(result.equals(expected))
+        expected = PeriodIndex(['2011-02', '2010-12', 'NaT',
+                                '2010-12'], freq='M', name='idx')
+        tm.assert_index_equal(result, expected)

    def test_asfreq(self):
        pi1 = PeriodIndex(freq='A', start='1/1/2001', end='1/1/2001')
@@ -2445,7 +2475,7 @@ def test_asfreq_nat(self):
        idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M')
        result = idx.asfreq(freq='Q')
        expected = PeriodIndex(['2011Q1', '2011Q1', 'NaT', '2011Q2'], freq='Q')
-        self.assertTrue(result.equals(expected))
+        tm.assert_index_equal(result, expected)

    def test_asfreq_mult_pi(self):
        pi = PeriodIndex(['2001-01', '2001-02', 'NaT', '2001-03'], freq='2M')
@@ -2465,37 +2495,37 @@ def test_asfreq_mult_pi(self):

    def test_period_index_length(self):
        pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009')
-        assert_equal(len(pi), 9)
+        self.assertEqual(len(pi), 9)

        pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009')
-        assert_equal(len(pi), 4 * 9)
+        self.assertEqual(len(pi), 4 * 9)

        pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009')
-        assert_equal(len(pi), 12 * 9)
+        self.assertEqual(len(pi), 12 * 9)

        start = Period('02-Apr-2005', 'B')
        i1 = PeriodIndex(start=start, periods=20)
-        assert_equal(len(i1), 20)
-        assert_equal(i1.freq, start.freq)
-        assert_equal(i1[0], start)
+        self.assertEqual(len(i1), 20)
+        self.assertEqual(i1.freq, start.freq)
+        self.assertEqual(i1[0], start)

        end_intv = Period('2006-12-31', 'W')
        i1 = PeriodIndex(end=end_intv, periods=10)
-        assert_equal(len(i1), 10)
-        assert_equal(i1.freq, end_intv.freq)
-        assert_equal(i1[-1], end_intv)
+        self.assertEqual(len(i1), 10)
+        self.assertEqual(i1.freq, end_intv.freq)
+        self.assertEqual(i1[-1], end_intv)

        end_intv = Period('2006-12-31', '1w')
        i2 = PeriodIndex(end=end_intv, periods=10)
-        assert_equal(len(i1), len(i2))
+        self.assertEqual(len(i1), len(i2))
        self.assertTrue((i1 == i2).all())
-        assert_equal(i1.freq, i2.freq)
+        self.assertEqual(i1.freq, i2.freq)

        end_intv = Period('2006-12-31', ('w', 1))
        i2 = PeriodIndex(end=end_intv, periods=10)
-        assert_equal(len(i1), len(i2))
+        self.assertEqual(len(i1), len(i2))
        self.assertTrue((i1 == i2).all())
-        assert_equal(i1.freq, i2.freq)
+        self.assertEqual(i1.freq, i2.freq)

        try:
            PeriodIndex(start=start, end=end_intv)
@@ -2515,12 +2545,12 @@ def test_period_index_length(self):

        # infer freq from first element
        i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')])
-        assert_equal(len(i2), 2)
-        assert_equal(i2[0], end_intv)
+        self.assertEqual(len(i2), 2)
+        self.assertEqual(i2[0], end_intv)

        i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')]))
-        assert_equal(len(i2), 2)
-        assert_equal(i2[0], end_intv)
+        self.assertEqual(len(i2), 2)
+        self.assertEqual(i2[0], end_intv)

        # Mixed freq should fail
        vals = [end_intv, Period('2006-12-31', 'w')]
@@ -2544,12 +2574,12 @@ def test_asfreq_ts(self):
        df_result = df.asfreq('D', how='end')
        exp_index = index.asfreq('D', how='end')
        self.assertEqual(len(result), len(ts))
-        self.assertTrue(result.index.equals(exp_index))
-        self.assertTrue(df_result.index.equals(exp_index))
+        tm.assert_index_equal(result.index, exp_index)
+        tm.assert_index_equal(df_result.index, exp_index)

        result = ts.asfreq('D', how='start')
        self.assertEqual(len(result), len(ts))
-        self.assertTrue(result.index.equals(index.asfreq('D', how='start')))
+        tm.assert_index_equal(result.index, index.asfreq('D', how='start'))

    def test_badinput(self):
        self.assertRaises(datetools.DateParseError, Period, '1/1/-2000', 'A')
@@ -2562,7 +2592,7 @@ def test_negative_ordinals(self):

        idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq='A')
        idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq='A')
-        tm.assert_numpy_array_equal(idx1, idx2)
+        tm.assert_index_equal(idx1, idx2)

    def test_dti_to_period(self):
        dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M')
@@ -2590,10 +2620,10 @@ def test_pindex_slice_index(self):
        s = Series(np.random.rand(len(pi)), index=pi)
        res = s['2010']
        exp = s[0:12]
-        assert_series_equal(res, exp)
+        tm.assert_series_equal(res, exp)
        res = s['2011']
        exp = s[12:24]
-        assert_series_equal(res, exp)
+        tm.assert_series_equal(res, exp)

    def test_getitem_day(self):
        # GH 6716
@@ -2619,9 +2649,9 @@ def test_getitem_day(self):
                continue

            s = Series(np.random.rand(len(idx)), index=idx)
-            assert_series_equal(s['2013/01'], s[0:31])
-            assert_series_equal(s['2013/02'], s[31:59])
-            assert_series_equal(s['2014'], s[365:])
+            tm.assert_series_equal(s['2013/01'], s[0:31])
+            tm.assert_series_equal(s['2013/02'], s[31:59])
+            tm.assert_series_equal(s['2014'], s[365:])

            invalid = ['2013/02/01 9H', '2013/02/01 09:00']
            for v in invalid:
@@ -2647,10 +2677,10 @@ def test_range_slice_day(self):

            s = Series(np.random.rand(len(idx)), index=idx)

-            assert_series_equal(s['2013/01/02':], s[1:])
-            assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5])
-            assert_series_equal(s['2013/02':], s[31:])
-            assert_series_equal(s['2014':], s[365:])
+            tm.assert_series_equal(s['2013/01/02':], s[1:])
+            tm.assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5])
+            tm.assert_series_equal(s['2013/02':], s[31:])
+            tm.assert_series_equal(s['2014':], s[365:])

            invalid = ['2013/02/01 9H', '2013/02/01 09:00']
            for v in invalid:
@@ -2680,10 +2710,10 @@ def test_getitem_seconds(self):
                continue

            s = Series(np.random.rand(len(idx)), index=idx)
-            assert_series_equal(s['2013/01/01 10:00'], s[3600:3660])
-            assert_series_equal(s['2013/01/01 9H'], s[:3600])
+            tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660])
+            tm.assert_series_equal(s['2013/01/01 9H'], s[:3600])
            for d in ['2013/01/01', '2013/01', '2013']:
-                assert_series_equal(s[d], s)
+                tm.assert_series_equal(s[d], s)

    def test_range_slice_seconds(self):
        # GH 6716
@@ -2705,14 +2735,14 @@ def test_range_slice_seconds(self):

            s = Series(np.random.rand(len(idx)), index=idx)

-            assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'],
-                                s[300:660])
-            assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'],
-                                s[3600:3960])
-            assert_series_equal(s['2013/01/01 10H':], s[3600:])
-            assert_series_equal(s[:'2013/01/01 09:30'], s[:1860])
+            tm.assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'],
+                                   s[300:660])
+            tm.assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'],
+                                   s[3600:3960])
+            tm.assert_series_equal(s['2013/01/01 10H':], s[3600:])
+            tm.assert_series_equal(s[:'2013/01/01 09:30'], s[:1860])
            for d in ['2013/01/01', '2013/01', '2013']:
-                assert_series_equal(s[d:], s)
+                tm.assert_series_equal(s[d:], s)

    def test_range_slice_outofbounds(self):
        # GH 5407
@@ -2721,8 +2751,8 @@ def test_range_slice_outofbounds(self):

        for idx in [didx, pidx]:
            df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx)
-            empty = DataFrame(index=idx.__class__(
-                [], freq='D'), columns=['units'])
+            empty = DataFrame(index=idx.__class__([], freq='D'),
+                              columns=['units'])
            empty['units'] = empty['units'].astype('int64')

            tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty)
@@ -2751,11 +2781,11 @@ def test_pindex_qaccess(self):
    def test_period_dt64_round_trip(self):
        dti = date_range('1/1/2000', '1/7/2002', freq='B')
        pi = dti.to_period()
-        self.assertTrue(pi.to_timestamp().equals(dti))
+        tm.assert_index_equal(pi.to_timestamp(), dti)

        dti = date_range('1/1/2000', '1/7/2002', freq='B')
        pi = dti.to_period(freq='H')
-        self.assertTrue(pi.to_timestamp().equals(dti))
+        tm.assert_index_equal(pi.to_timestamp(), dti)

    def test_to_period_quarterly(self):
        # make sure we can make the round trip
@@ -2764,7 +2794,7 @@ def test_to_period_quarterly(self):
            rng = period_range('1989Q3', '1991Q3', freq=freq)
            stamps = rng.to_timestamp()
            result = stamps.to_period(freq)
-            self.assertTrue(rng.equals(result))
+            tm.assert_index_equal(rng, result)

    def test_to_period_quarterlyish(self):
        offsets = ['BQ', 'QS', 'BQS']
@@ -2809,7 +2839,7 @@ def test_multiples(self):
    def test_pindex_multiples(self):
        pi = PeriodIndex(start='1/1/11', end='12/31/11', freq='2M')
        expected = PeriodIndex(['2011-01', '2011-03', '2011-05', '2011-07',
-                                '2011-09', '2011-11'], freq='M')
+                                '2011-09', '2011-11'], freq='2M')
        tm.assert_index_equal(pi, expected)
        self.assertEqual(pi.freq, offsets.MonthEnd(2))
        self.assertEqual(pi.freqstr, '2M')
@@ -2842,7 +2872,7 @@ def test_take(self):
        taken2 = index[[5, 6, 8, 12]]

        for taken in [taken1, taken2]:
-            self.assertTrue(taken.equals(expected))
+            tm.assert_index_equal(taken, expected)
            tm.assertIsInstance(taken, PeriodIndex)
            self.assertEqual(taken.freq, index.freq)
            self.assertEqual(taken.name, expected.name)
@@ -2913,16 +2943,16 @@ def test_align_series(self):
        result = ts + ts[::2]
        expected = ts + ts
        expected[1::2] = np.nan
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)

        result = ts + _permute(ts[::2])
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)

        # it works!
        for kind in ['inner', 'outer', 'left', 'right']:
            ts.align(ts[::2], join=kind)

        msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)"
-        with assertRaisesRegexp(ValueError, msg):
+        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
            ts + ts.asfreq('D', how="end")

    def test_align_frame(self):
@@ -2941,11 +2971,11 @@ def test_union(self):
        index = period_range('1/1/2000', '1/20/2000', freq='D')

        result = index[:-5].union(index[10:])
-        self.assertTrue(result.equals(index))
+        tm.assert_index_equal(result, index)

        # not in order
        result = _permute(index[:-5]).union(_permute(index[10:]))
-        self.assertTrue(result.equals(index))
+        tm.assert_index_equal(result, index)

        # raise if different frequencies
        index = period_range('1/1/2000', '1/20/2000', freq='D')
@@ -2976,13 +3006,13 @@ def test_intersection(self):
        index = period_range('1/1/2000', '1/20/2000', freq='D')

        result = index[:-5].intersection(index[10:])
-        self.assertTrue(result.equals(index[10:-5]))
+        tm.assert_index_equal(result, index[10:-5])

        # not in order
        left = _permute(index[:-5])
        right = _permute(index[10:])
        result = left.intersection(right).sort_values()
-        self.assertTrue(result.equals(index[10:-5]))
+        tm.assert_index_equal(result, index[10:-5])

        # raise if different frequencies
        index = period_range('1/1/2000', '1/20/2000', freq='D')
@@ -3013,7 +3043,7 @@ def test_intersection_cases(self):
        for (rng, expected) in [(rng2, expected2), (rng3, expected3),
                                (rng4, expected4)]:
            result = base.intersection(rng)
-            self.assertTrue(result.equals(expected))
+            tm.assert_index_equal(result, expected)
            self.assertEqual(result.name, expected.name)
            self.assertEqual(result.freq, expected.freq)
@@ -3039,7 +3069,7 @@ def test_intersection_cases(self):
        for (rng, expected) in [(rng2, expected2), (rng3, expected3),
                                (rng4, expected4)]:
            result = base.intersection(rng)
-            self.assertTrue(result.equals(expected))
+            tm.assert_index_equal(result, expected)
            self.assertEqual(result.name, expected.name)
            self.assertEqual(result.freq, 'D')
@@ -3093,9 +3123,9 @@ def _check_all_fields(self, periodindex):

        for field in fields:
            field_idx = getattr(periodindex, field)
-            assert_equal(len(periodindex), len(field_idx))
+            self.assertEqual(len(periodindex), len(field_idx))
            for x, val in zip(periods, field_idx):
-                assert_equal(getattr(x, field), val)
+                self.assertEqual(getattr(x, field), val)

    def test_is_full(self):
        index = PeriodIndex([2005, 2007, 2009], freq='A')
@@ -3119,10 +3149,10 @@ def test_map(self):
        index = PeriodIndex([2005, 2007, 2009], freq='A')

        result = index.map(lambda x: x + 1)
        expected = index + 1
-        self.assertTrue(result.equals(expected))
+        tm.assert_index_equal(result, expected)

        result = index.map(lambda x: x.ordinal)
-        exp = [x.ordinal for x in index]
+        exp = np.array([x.ordinal for x in index], dtype=np.int64)
        tm.assert_numpy_array_equal(result, exp)

    def test_map_with_string_constructor(self):
@@ -3220,11 +3250,11 @@ def test_factorize(self):

        arr, idx = idx1.factorize()
        self.assert_numpy_array_equal(arr, exp_arr)
-        self.assertTrue(idx.equals(exp_idx))
+        tm.assert_index_equal(idx, exp_idx)

        arr, idx = idx1.factorize(sort=True)
        self.assert_numpy_array_equal(arr, exp_arr)
-        self.assertTrue(idx.equals(exp_idx))
+        tm.assert_index_equal(idx, exp_idx)

        idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
                               '2014-03', '2014-01'], freq='M')
@@ -3232,19 +3262,19 @@ def test_factorize(self):
        exp_arr = np.array([2, 2, 1, 0, 2, 0])
        arr, idx = idx2.factorize(sort=True)
        self.assert_numpy_array_equal(arr, exp_arr)
-        self.assertTrue(idx.equals(exp_idx))
+        tm.assert_index_equal(idx, exp_idx)

        exp_arr = np.array([0, 0, 1, 2, 0, 2])
        exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M')
        arr, idx = idx2.factorize()
        self.assert_numpy_array_equal(arr, exp_arr)
-        self.assertTrue(idx.equals(exp_idx))
+        tm.assert_index_equal(idx, exp_idx)

    def test_recreate_from_data(self):
        for o in ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'N', 'H']:
            org = PeriodIndex(start='2001/04/01', freq=o, periods=1)
            idx = PeriodIndex(org.values, freq=o)
-            self.assertTrue(idx.equals(org))
+            tm.assert_index_equal(idx, org)

    def test_combine_first(self):
        # GH 3367
@@ -3292,13 +3322,12 @@ def _permute(obj):

class TestMethods(tm.TestCase):
-    "Base test class for MaskedArrays."

    def test_add(self):
        dt1 = Period(freq='D', year=2008, month=1, day=1)
        dt2 = Period(freq='D', year=2008, month=1, day=2)
-        assert_equal(dt1 + 1, dt2)
-        assert_equal(1 + dt1, dt2)
+        self.assertEqual(dt1 + 1, dt2)
+        self.assertEqual(1 + dt1, dt2)

    def test_add_pdnat(self):
        p = pd.Period('2011-01', freq='M')
@@ -3324,6 +3353,17 @@ def test_add_raises(self):
        with tm.assertRaisesRegexp(TypeError, msg):
            dt1 + dt2

+    def test_sub(self):
+        dt1 = Period('2011-01-01', freq='D')
+        dt2 = Period('2011-01-15', freq='D')
+
+        self.assertEqual(dt1 - dt2, -14)
+        self.assertEqual(dt2 - dt1, 14)
+
+        msg = "Input has different freq=M from Period\(freq=D\)"
+        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+            dt1 - pd.Period('2011-02', freq='M')
+
    def test_add_offset(self):
        # freq is DateOffset
        for freq in ['A', '2A', '3A']:
@@ -3335,14 +3375,14 @@ def test_add_offset(self):
            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                      offsets.Minute(), np.timedelta64(365, 'D'),
                      timedelta(365)]:
-                with tm.assertRaises(ValueError):
+                with tm.assertRaises(period.IncompatibleFrequency):
                    p + o

                if isinstance(o, np.timedelta64):
                    with tm.assertRaises(TypeError):
                        o + p
                else:
-                    with tm.assertRaises(ValueError):
+                    with tm.assertRaises(period.IncompatibleFrequency):
                        o + p

        for freq in ['M', '2M', '3M']:
@@ -3358,14 +3398,14 @@ def test_add_offset(self):
            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                      offsets.Minute(), np.timedelta64(365, 'D'),
                      timedelta(365)]:
-                with tm.assertRaises(ValueError):
+                with tm.assertRaises(period.IncompatibleFrequency):
                    p + o

                if isinstance(o, np.timedelta64):
                    with tm.assertRaises(TypeError):
                        o + p
                else:
-                    with tm.assertRaises(ValueError):
+                    with tm.assertRaises(period.IncompatibleFrequency):
                        o + p

        # freq is Tick
@@ -3401,14 +3441,14 @@ def test_add_offset(self):
            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                      offsets.Minute(), np.timedelta64(4, 'h'),
                      timedelta(hours=23)]:
-                with tm.assertRaises(ValueError):
+                with tm.assertRaises(period.IncompatibleFrequency):
                    p + o

                if isinstance(o, np.timedelta64):
                    with tm.assertRaises(TypeError):
                        o + p
                else:
-                    with tm.assertRaises(ValueError):
+                    with tm.assertRaises(period.IncompatibleFrequency):
                        o + p

        for freq in ['H', '2H', '3H']:
@@ -3443,14 +3483,14 @@ def test_add_offset(self):
            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                      offsets.Minute(), np.timedelta64(3200, 's'),
                      timedelta(hours=23, minutes=30)]:
-                with tm.assertRaises(ValueError):
+                with tm.assertRaises(period.IncompatibleFrequency):
                    p + o

                if isinstance(o, np.timedelta64):
                    with tm.assertRaises(TypeError):
                        o + p
                else:
-                    with tm.assertRaises(ValueError):
+                    with tm.assertRaises(period.IncompatibleFrequency):
                        o + p

    def test_add_offset_nat(self):
@@ -3464,14 +3504,14 @@ def test_add_offset_nat(self):
            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                      offsets.Minute(), np.timedelta64(365, 'D'),
                      timedelta(365)]:
-                with tm.assertRaises(ValueError):
+                with tm.assertRaises(period.IncompatibleFrequency):
                    p + o

                if isinstance(o, np.timedelta64):
                    with tm.assertRaises(TypeError):
                        o + p
                else:
-                    with tm.assertRaises(ValueError):
+                    with tm.assertRaises(period.IncompatibleFrequency):
                        o + p

        for freq in ['M', '2M', '3M']:
@@ -3488,14 +3528,14 @@ def test_add_offset_nat(self):
            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                      offsets.Minute(), np.timedelta64(365, 'D'),
                      timedelta(365)]:
-                with tm.assertRaises(ValueError):
+                with tm.assertRaises(period.IncompatibleFrequency):
                    p + o

                if isinstance(o, np.timedelta64):
                    with tm.assertRaises(TypeError):
                        o + p
                else:
-                    with tm.assertRaises(ValueError):
+                    with tm.assertRaises(period.IncompatibleFrequency):
                        o + p

        # freq is Tick
        for freq in ['D', '2D', '3D']:
@@ -3515,14 +3555,14 @@ def test_add_offset_nat(self):
                      offsets.Minute(), np.timedelta64(4, 'h'),
                      timedelta(hours=23)]:
-                with tm.assertRaises(ValueError):
+                with tm.assertRaises(period.IncompatibleFrequency):
                    p + o

                if isinstance(o, np.timedelta64):
                    with tm.assertRaises(TypeError):
                        o + p
                else:
-                    with tm.assertRaises(ValueError):
+                    with tm.assertRaises(period.IncompatibleFrequency):
                        o + p

        for freq in ['H', '2H', '3H']:
@@ -3538,14 +3578,14 @@ def test_add_offset_nat(self):
            for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                      offsets.Minute(), np.timedelta64(3200, 's'),
                      timedelta(hours=23, minutes=30)]:
-                with tm.assertRaises(ValueError):
+                with tm.assertRaises(period.IncompatibleFrequency):
                    p + o

                if isinstance(o, np.timedelta64):
                    with tm.assertRaises(TypeError):
                        o + p
                else:
-                    with tm.assertRaises(ValueError):
+                    with tm.assertRaises(period.IncompatibleFrequency):
                        o + p

    def test_sub_pdnat(self):
@@ -3567,7 +3607,7 @@ def test_sub_offset(self):
        for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                  offsets.Minute(), np.timedelta64(365, 'D'),
                  timedelta(365)]:
-            with tm.assertRaises(ValueError):
+            with tm.assertRaises(period.IncompatibleFrequency):
                p - o

        for freq in ['M', '2M', '3M']:
@@ -3580,7 +3620,7 @@ def test_sub_offset(self):
        for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                  offsets.Minute(), np.timedelta64(365, 'D'),
                  timedelta(365)]:
-            with tm.assertRaises(ValueError):
+            with tm.assertRaises(period.IncompatibleFrequency):
                p - o

        # freq is Tick
@@ -3602,7 +3642,7 @@ def test_sub_offset(self):
        for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                  offsets.Minute(), np.timedelta64(4, 'h'),
                  timedelta(hours=23)]:
-            with tm.assertRaises(ValueError):
+            with tm.assertRaises(period.IncompatibleFrequency):
                p - o

        for freq in ['H', '2H', '3H']:
@@ -3623,7 +3663,7 @@ def test_sub_offset(self):
        for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                  offsets.Minute(), np.timedelta64(3200, 's'),
                  timedelta(hours=23, minutes=30)]:
-            with tm.assertRaises(ValueError):
+            with tm.assertRaises(period.IncompatibleFrequency):
                p - o

    def test_sub_offset_nat(self):
@@ -3636,7 +3676,7 @@ def test_sub_offset_nat(self):
        for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                  offsets.Minute(), np.timedelta64(365, 'D'),
                  timedelta(365)]:
-            with tm.assertRaises(ValueError):
+            with tm.assertRaises(period.IncompatibleFrequency):
                p - o

        for freq in ['M', '2M', '3M']:
@@ -3647,7 +3687,7 @@ def test_sub_offset_nat(self):
        for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                  offsets.Minute(), np.timedelta64(365, 'D'),
                  timedelta(365)]:
-            with tm.assertRaises(ValueError):
+            with tm.assertRaises(period.IncompatibleFrequency):
                p - o

        # freq is Tick
@@ -3661,7 +3701,7 @@ def test_sub_offset_nat(self):
        for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                  offsets.Minute(), np.timedelta64(4, 'h'),
                  timedelta(hours=23)]:
-            with tm.assertRaises(ValueError):
+            with tm.assertRaises(period.IncompatibleFrequency):
                p - o

        for freq in ['H', '2H', '3H']:
@@ -3674,7 +3714,7 @@ def test_sub_offset_nat(self):
        for o in [offsets.YearBegin(2), offsets.MonthBegin(1),
                  offsets.Minute(), np.timedelta64(3200, 's'),
                  timedelta(hours=23, minutes=30)]:
-            with tm.assertRaises(ValueError):
+            with tm.assertRaises(period.IncompatibleFrequency):
                p - o

    def test_nat_ops(self):
@@ -3683,77 +3723,153 @@ def test_nat_ops(self):
            self.assertEqual((p + 1).ordinal, tslib.iNaT)
            self.assertEqual((1 + p).ordinal, tslib.iNaT)
            self.assertEqual((p - 1).ordinal, tslib.iNaT)
-            self.assertEqual(
-                (p - Period('2011-01', freq=freq)).ordinal, tslib.iNaT)
-            self.assertEqual(
-                (Period('2011-01', freq=freq) - p).ordinal, tslib.iNaT)
+            self.assertEqual((p - Period('2011-01', freq=freq)).ordinal,
+                             tslib.iNaT)
+            self.assertEqual((Period('2011-01', freq=freq) - p).ordinal,
+                             tslib.iNaT)
+
+    def test_period_ops_offset(self):
+        p = Period('2011-04-01', freq='D')
+        result = p + offsets.Day()
+        exp = pd.Period('2011-04-02', freq='D')
+        self.assertEqual(result, exp)

-    def test_pi_ops_nat(self):
-        idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
+        result = p - offsets.Day(2)
+        exp = pd.Period('2011-03-30', freq='D')
+        self.assertEqual(result, exp)
+
+        msg = "Input cannot be converted to Period\(freq=D\)"
+        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+            p + offsets.Hour(2)
+
+        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+            p - offsets.Hour(2)
+
+
+class TestPeriodIndexSeriesMethods(tm.TestCase):
+    """ Test PeriodIndex and Period Series Ops consistency """
+
+    def _check(self, values, func, expected):
+        idx = pd.PeriodIndex(values)
+        result = func(idx)
+        tm.assert_index_equal(result, pd.PeriodIndex(expected))
+
+        s = pd.Series(values)
+        result = func(s)
+
+        exp = pd.Series(expected)
+        # Period(NaT) != Period(NaT)
+
+        lmask = result.map(lambda x: x.ordinal != tslib.iNaT)
+        rmask = exp.map(lambda x: x.ordinal != tslib.iNaT)
+        tm.assert_series_equal(lmask, rmask)
+        tm.assert_series_equal(result[lmask], exp[rmask])
+
+    def test_pi_ops(self):
+        idx = PeriodIndex(['2011-01', '2011-02', '2011-03',
                           '2011-04'], freq='M', name='idx')

-        result = idx + 2
-        expected = PeriodIndex(
-            ['2011-03', '2011-04', 'NaT', '2011-06'], freq='M', name='idx')
-        self.assertTrue(result.equals(expected))

-        result2 = result - 2
-        self.assertTrue(result2.equals(idx))
+        expected = PeriodIndex(['2011-03', '2011-04',
+                                '2011-05', '2011-06'], freq='M', name='idx')
+        self._check(idx, lambda x: x + 2, expected)
+        self._check(idx, lambda x: 2 + x, expected)
+
+        self._check(idx + 2, lambda x: x - 2, idx)
+        result = idx - Period('2011-01', freq='M')
+        exp = pd.Index([0, 1, 2, 3], name='idx')
+        tm.assert_index_equal(result, exp)
+
+        result = Period('2011-01', freq='M') - idx
+        exp = pd.Index([0, -1, -2, -3], name='idx')
+        tm.assert_index_equal(result, exp)
+
+    def test_pi_ops_errors(self):
+        idx = PeriodIndex(['2011-01', '2011-02', '2011-03',
+                           '2011-04'], freq='M', name='idx')
+        s = pd.Series(idx)

        msg = "unsupported operand type\(s\)"
-        with tm.assertRaisesRegexp(TypeError, msg):
-            idx + "str"
+        for obj in [idx, s]:
+            for ng in ["str", 1.5]:
+                with tm.assertRaisesRegexp(TypeError, msg):
+                    obj + ng
+
+                with tm.assertRaises(TypeError):
+                    # error message differs between PY2 and 3
+                    ng + obj
+
+                with tm.assertRaisesRegexp(TypeError, msg):
+                    obj - ng
+
+    def test_pi_ops_nat(self):
+        idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
+                           '2011-04'], freq='M', name='idx')
+        expected = PeriodIndex(['2011-03', '2011-04',
+                                'NaT', '2011-06'], freq='M', name='idx')
+        self._check(idx, lambda x: x + 2, expected)
+        self._check(idx, lambda x: 2 + x, expected)

-    def test_pi_ops_array(self):
+        self._check(idx + 2, lambda x: x - 2, idx)
+
+    def test_pi_ops_array_int(self):
        idx = PeriodIndex(['2011-01', '2011-02', 'NaT',
                           '2011-04'], freq='M', name='idx')
-        result = idx + np.array([1, 2, 3, 4])
+        f = lambda x: x + np.array([1, 2, 3, 4])
        exp = PeriodIndex(['2011-02', '2011-04', 'NaT',
                           '2011-08'], freq='M', name='idx')
-        self.assert_index_equal(result, exp)
+        self._check(idx, f, exp)

-        result = np.add(idx, np.array([4, -1, 1, 2]))
+        f = lambda x: np.add(x, np.array([4, -1, 1, 2]))
        exp = PeriodIndex(['2011-05', '2011-01', 'NaT',
                           '2011-06'], freq='M', name='idx')
-        self.assert_index_equal(result, exp)
+        self._check(idx, f, exp)

-        result = idx - np.array([1, 2, 3, 4])
+        f = lambda x: x - np.array([1, 2, 3, 4])
        exp = PeriodIndex(['2010-12', '2010-12', 'NaT',
                           '2010-12'], freq='M', name='idx')
-        self.assert_index_equal(result, exp)
+        self._check(idx, f, exp)

-        result = np.subtract(idx, np.array([3, 2, 3, -2]))
+        f = lambda x: np.subtract(x, np.array([3, 2, 3, -2]))
        exp = PeriodIndex(['2010-10', '2010-12', 'NaT',
                           '2011-06'], freq='M', name='idx')
-        self.assert_index_equal(result, exp)
-
-        # incompatible freq
-        msg = "Input has different freq from PeriodIndex\(freq=M\)"
-        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-            idx + np.array([np.timedelta64(1, 'D')] * 4)
-
-        idx = PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', 'NaT',
-                           '2011-01-01 12:00'], freq='H', name='idx')
-        result = idx + np.array([np.timedelta64(1, 'D')] * 4)
-        exp = PeriodIndex(['2011-01-02 09:00', '2011-01-02 10:00', 'NaT',
-                           '2011-01-02 12:00'], freq='H', name='idx')
-        self.assert_index_equal(result, exp)
-
-        result = idx - np.array([np.timedelta64(1, 'h')] * 4)
-        exp = PeriodIndex(['2011-01-01 08:00', '2011-01-01 09:00', 'NaT',
-                           '2011-01-01 11:00'], freq='H', name='idx')
-        self.assert_index_equal(result, exp)
+        self._check(idx, f, exp)
+
+    def test_pi_ops_offset(self):
+        idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01',
+                           '2011-04-01'], freq='D', name='idx')
+        f = lambda x: x + offsets.Day()
+        exp = PeriodIndex(['2011-01-02', '2011-02-02', '2011-03-02',
+                           '2011-04-02'], freq='D', name='idx')
+        self._check(idx, f, exp)
+
+        f = lambda x: x + offsets.Day(2)
+        exp = PeriodIndex(['2011-01-03', '2011-02-03', '2011-03-03',
+                           '2011-04-03'], freq='D', name='idx')
+        self._check(idx, f, exp)
+
+        f = lambda x: x - offsets.Day(2)
+        exp = PeriodIndex(['2010-12-30', '2011-01-30', '2011-02-27',
+                           '2011-03-30'], freq='D', name='idx')
+        self._check(idx, f, exp)
+
+    def test_pi_offset_errors(self):
+        idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01',
+                           '2011-04-01'], freq='D', name='idx')
+        s = pd.Series(idx)
+
+        # Series op is applied per Period instance, thus error is raised
+        # from Period
+        msg_idx = "Input has different freq from PeriodIndex\(freq=D\)"
+        msg_s = "Input cannot be converted to Period\(freq=D\)"
+        for obj, msg in [(idx, msg_idx), (s, msg_s)]:
+            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+                obj + offsets.Hour(2)

-        msg = "Input has different freq from PeriodIndex\(freq=H\)"
-        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
-            idx + np.array([np.timedelta64(1, 's')] * 4)
+            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+                offsets.Hour(2) + obj

-        idx = PeriodIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', 'NaT',
-                           '2011-01-01 12:00:00'], freq='S', name='idx')
-        result = idx + np.array([np.timedelta64(1, 'h'), np.timedelta64(
-            30, 's'), np.timedelta64(2, 'h'), np.timedelta64(15, 'm')])
-        exp = PeriodIndex(['2011-01-01 10:00:00', '2011-01-01 10:00:30', 'NaT',
-                           '2011-01-01 12:15:00'], freq='S', name='idx')
-        self.assert_index_equal(result, exp)
+            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+                obj - offsets.Hour(2)

    def test_pi_sub_period(self):
        # GH 13071
@@ -3871,7 +3987,7 @@ def test_equal(self):
        self.assertEqual(self.january1, self.january2)

    def test_equal_Raises_Value(self):
-        with tm.assertRaises(ValueError):
+        with tm.assertRaises(period.IncompatibleFrequency):
            self.january1 == self.day

    def test_notEqual(self):
@@ -3882,7 +3998,7 @@ def test_greater(self):
        self.assertTrue(self.february > self.january1)

    def test_greater_Raises_Value(self):
-        with tm.assertRaises(ValueError):
+        with tm.assertRaises(period.IncompatibleFrequency):
            self.january1 > self.day

    def test_greater_Raises_Type(self):
@@ -3893,8 +4009,9 @@ def test_greaterEqual(self):
        self.assertTrue(self.january1 >= self.january2)

    def test_greaterEqual_Raises_Value(self):
-        with tm.assertRaises(ValueError):
+        with tm.assertRaises(period.IncompatibleFrequency):
            self.january1 >= self.day
+
        with tm.assertRaises(TypeError):
            print(self.january1 >= 1)

@@ -3902,7 +4019,7 @@ def test_smallerEqual(self):
        self.assertTrue(self.january1 <= self.january2)

    def test_smallerEqual_Raises_Value(self):
-        with tm.assertRaises(ValueError):
+        with tm.assertRaises(period.IncompatibleFrequency):
            self.january1 <= self.day

    def test_smallerEqual_Raises_Type(self):
@@ -3913,7 +4030,7 @@ def test_smaller(self):
        self.assertTrue(self.january1 < self.february)

    def test_smaller_Raises_Value(self):
-        with tm.assertRaises(ValueError):
+        with tm.assertRaises(period.IncompatibleFrequency):
            self.january1 < self.day

    def test_smaller_Raises_Type(self):
@@ -3950,24 +4067,30 @@ def test_pi_pi_comp(self):

        exp = np.array([False, True, False, False])
        self.assert_numpy_array_equal(base == p, exp)
+        self.assert_numpy_array_equal(p == base, exp)

        exp = np.array([True, False, True, True])
        self.assert_numpy_array_equal(base != p, exp)
+        self.assert_numpy_array_equal(p != base, exp)

        exp = np.array([False, False, True, True])
        self.assert_numpy_array_equal(base > p, exp)
+        self.assert_numpy_array_equal(p < base, exp)

        exp = np.array([True, False, False, False])
        self.assert_numpy_array_equal(base < p, exp)
+        self.assert_numpy_array_equal(p > base, exp)

        exp = np.array([False, True, True, True])
        self.assert_numpy_array_equal(base >= p, exp)
+        self.assert_numpy_array_equal(p <= base, exp)

        exp = np.array([True, True, False, False])
        self.assert_numpy_array_equal(base <= p, exp)
+        self.assert_numpy_array_equal(p >= base, exp)

-        idx = PeriodIndex(
-            ['2011-02', '2011-01', '2011-03', '2011-05'], freq=freq)
+        idx = PeriodIndex(['2011-02', '2011-01', '2011-03',
+                           '2011-05'], freq=freq)

        exp = np.array([False, False, True, False])
        self.assert_numpy_array_equal(base == idx, exp)
@@ -3992,7 +4115,10 @@ def test_pi_pi_comp(self):
        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
            base <= Period('2011', freq='A')

-        with tm.assertRaisesRegexp(ValueError, msg):
+        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+            Period('2011', freq='A') >= base
+
+        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
            idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A')
            base <= idx
@@ -4001,6 +4127,9 @@ def test_pi_pi_comp(self):
        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
            base <= Period('2011', freq='4M')

+        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+            Period('2011', freq='4M') >= base
+
        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
            idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M')
            base <= idx
@@ -4013,17 +4142,23 @@ def test_pi_nat_comp(self):
        result = idx1 > Period('2011-02', freq=freq)
        exp = np.array([False, False, False, True])
        self.assert_numpy_array_equal(result, exp)
+        result = Period('2011-02', freq=freq) < idx1
+        self.assert_numpy_array_equal(result, exp)

        result = idx1 == Period('NaT', freq=freq)
        exp = np.array([False, False, False, False])
        self.assert_numpy_array_equal(result, exp)
+        result = Period('NaT', freq=freq) == idx1
+        self.assert_numpy_array_equal(result, exp)

        result = idx1 != Period('NaT', freq=freq)
        exp = np.array([True, True, True, True])
        self.assert_numpy_array_equal(result, exp)
+        result = Period('NaT', freq=freq) != idx1
+        self.assert_numpy_array_equal(result, exp)

-        idx2 = PeriodIndex(
-            ['2011-02', '2011-01', '2011-04', 'NaT'], freq=freq)
+        idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04',
+                            'NaT'], freq=freq)
        result = idx1 < idx2
        exp = np.array([True, False, False, False])
        self.assert_numpy_array_equal(result, exp)
@@ -4044,11 +4179,12 @@ def test_pi_nat_comp(self):
        exp = np.array([False, False, True, False])
        self.assert_numpy_array_equal(result, exp)

-        diff = PeriodIndex(
-            ['2011-02', '2011-01', '2011-04', 'NaT'], freq='4M')
+        diff = PeriodIndex(['2011-02', '2011-01', '2011-04',
+                            'NaT'], freq='4M')
        msg = "Input has different freq=4M from PeriodIndex"
        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
            idx1 > diff
+
        with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
            idx1 == diff

@@ -4089,19 +4225,19 @@ def test_constructor_cast_object(self):
    def test_series_comparison_scalars(self):
        val = pd.Period('2000-01-04', freq='D')
        result = self.series > val
-        expected = np.array([x > val for x in self.series])
-        self.assert_numpy_array_equal(result, expected)
+        expected = pd.Series([x > val for x in self.series])
+        tm.assert_series_equal(result, expected)

        val = self.series[5]
        result = self.series > val
-        expected = np.array([x > val for x in self.series])
-        self.assert_numpy_array_equal(result, expected)
+        expected = pd.Series([x > val for x in self.series])
+        tm.assert_series_equal(result, expected)

    def test_between(self):
        left, right = self.series[[2, 7]]
        result = self.series.between(left, right)
        expected = (self.series >= left) & (self.series <= right)
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)

    # ---------------------------------------------------------------------
    # NaT support
@@ -4120,7 +4256,7 @@ def test_NaT_scalar(self):
    def test_NaT_cast(self):
        result = Series([np.nan]).astype('period[D]')
        expected = Series([NaT])
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
    """

    def test_set_none_nan(self):
@@ -4151,6 +4287,176 @@ def test_intercept_astype_object(self):
        result = df.values.squeeze()
        self.assertTrue((result[:, 0] == expected.values).all())

+    def test_ops_series_timedelta(self):
+        # GH 13043
+        s = pd.Series([pd.Period('2015-01-01', freq='D'),
+                       pd.Period('2015-01-02', freq='D')], name='xxx')
+        self.assertEqual(s.dtype, object)
+
+        exp = pd.Series([pd.Period('2015-01-02', freq='D'),
+                         pd.Period('2015-01-03', freq='D')], name='xxx')
+        tm.assert_series_equal(s + pd.Timedelta('1 days'), exp)
+        tm.assert_series_equal(pd.Timedelta('1 days') + s, exp)
+
+        tm.assert_series_equal(s + pd.tseries.offsets.Day(), exp)
+        tm.assert_series_equal(pd.tseries.offsets.Day() + s, exp)
+
+    def test_ops_series_period(self):
+        # GH 13043
+        s = pd.Series([pd.Period('2015-01-01', freq='D'),
+                       pd.Period('2015-01-02', freq='D')], name='xxx')
+        self.assertEqual(s.dtype, object)
+
+        p = pd.Period('2015-01-10', freq='D')
+        # dtype will be object because of original dtype
+        exp = pd.Series([9, 8], name='xxx', dtype=object)
+        tm.assert_series_equal(p - s, exp)
+        tm.assert_series_equal(s - p, -exp)
+
+        s2 = pd.Series([pd.Period('2015-01-05', freq='D'),
+                        pd.Period('2015-01-04', freq='D')], name='xxx')
+        self.assertEqual(s2.dtype, object)
+
+        exp = pd.Series([4, 2], name='xxx', dtype=object)
+        tm.assert_series_equal(s2 - s, exp)
+        tm.assert_series_equal(s - s2, -exp)
+
+    def test_comp_series_period_scalar(self):
+        # GH 13200
+        for freq in ['M', '2M', '3M']:
+            base = Series([Period(x, freq=freq) for x in
+                           ['2011-01', '2011-02', '2011-03', '2011-04']])
+            p = Period('2011-02', freq=freq)
+
+            exp = pd.Series([False, True, False, False])
+            tm.assert_series_equal(base == p, exp)
+            tm.assert_series_equal(p == base, exp)
+
+            exp = pd.Series([True, False, True, True])
+            tm.assert_series_equal(base != p, exp)
+            tm.assert_series_equal(p != base, exp)
+
+            exp = pd.Series([False, False, True, True])
+            tm.assert_series_equal(base > p, exp)
+            tm.assert_series_equal(p < base, exp)
+
+            exp = pd.Series([True, False, False, False])
+            tm.assert_series_equal(base < p, exp)
+            tm.assert_series_equal(p > base, exp)
+
+            exp = pd.Series([False, True, True, True])
+            tm.assert_series_equal(base >= p, exp)
+            tm.assert_series_equal(p <= base, exp)
+
+            exp = pd.Series([True, True, False, False])
+            tm.assert_series_equal(base <= p, exp)
+            tm.assert_series_equal(p >= base, exp)
+
+            # different base freq
+            msg = "Input has different freq=A-DEC from Period"
+            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+                base <= Period('2011', freq='A')
+
+            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+                Period('2011', freq='A') >= base
+
+    def test_comp_series_period_series(self):
+        # GH 13200
+        for freq in ['M', '2M', '3M']:
+            base = Series([Period(x, freq=freq) for x in
+                           ['2011-01', '2011-02', '2011-03', '2011-04']])
+
+            s = Series([Period(x, freq=freq) for x in
+                        ['2011-02', '2011-01', '2011-03', '2011-05']])
+
+            exp = Series([False, False, True, False])
+            tm.assert_series_equal(base == s, exp)
+
+            exp = Series([True, True, False, True])
+            tm.assert_series_equal(base != s, exp)
+
+            exp = Series([False, True, False, False])
+            tm.assert_series_equal(base > s, exp)
+
+            exp = Series([True, False, False, True])
+            tm.assert_series_equal(base < s, exp)
+
+            exp = Series([False, True, True, False])
+            tm.assert_series_equal(base >= s, exp)
+
+            exp = Series([True, False, True, True])
+            tm.assert_series_equal(base <= s, exp)
+
+            s2 = Series([Period(x, freq='A') for x in
+                         ['2011', '2011', '2011', '2011']])
+
+            # different base freq
+            msg = "Input has different freq=A-DEC from Period"
+            with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg):
+                base <= s2
+
+    def test_comp_series_period_object(self):
+        # GH 13200
+        base = Series([Period('2011', freq='A'), Period('2011-02', freq='M'),
+                       Period('2013', freq='A'), Period('2011-04', freq='M')])
+
+        s = Series([Period('2012', freq='A'), Period('2011-01', freq='M'),
+                    Period('2013', freq='A'), Period('2011-05', freq='M')])
+
+        exp = Series([False, False, True, False])
+        tm.assert_series_equal(base == s, exp)
+
+        exp = Series([True, True, False, True])
+        tm.assert_series_equal(base != s, exp)
+
+        exp = Series([False, True, False, False])
+        tm.assert_series_equal(base > s, exp)
+
+        exp = Series([True, False, False, True])
+        tm.assert_series_equal(base < s, exp)
+
+        exp = Series([False, True, True, False])
+        tm.assert_series_equal(base >= s, exp)
+
+        exp = Series([True, False, True, True])
+        tm.assert_series_equal(base <= s, exp)
+
+    def test_ops_frame_period(self):
+        # GH 13043
+        df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'),
+                                 pd.Period('2015-02', freq='M')],
+                           'B': [pd.Period('2014-01', freq='M'),
+                                 pd.Period('2014-02', freq='M')]})
+        self.assertEqual(df['A'].dtype, object)
+        self.assertEqual(df['B'].dtype, object)
+
+        p = pd.Period('2015-03', freq='M')
+        # dtype will be object because of original dtype
+        exp = pd.DataFrame({'A': np.array([2, 1], dtype=object),
+                            'B': np.array([14, 13], dtype=object)})
+        tm.assert_frame_equal(p - df, exp)
+        tm.assert_frame_equal(df - p, -exp)
+
+        df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'),
+                                  pd.Period('2015-06', freq='M')],
+                            'B': [pd.Period('2015-05', freq='M'),
+                                  pd.Period('2015-06', freq='M')]})
+        self.assertEqual(df2['A'].dtype, object)
+        self.assertEqual(df2['B'].dtype, object)
+
+        exp = pd.DataFrame({'A': np.array([4, 4], dtype=object),
+                            'B': np.array([16, 16], dtype=object)})
+        tm.assert_frame_equal(df2 - df, exp)
+        tm.assert_frame_equal(df - df2, -exp)
+
+
+class TestPeriodField(tm.TestCase):
+    def test_get_period_field_raises_on_out_of_range(self):
+        self.assertRaises(ValueError, _period.get_period_field, -1, 0, 0)
+
+    def test_get_period_field_array_raises_on_out_of_range(self):
+        self.assertRaises(ValueError, _period.get_period_field_arr, -1,
+                          np.empty(1), 0)

if __name__ == '__main__':
    import nose
diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py
index 9fab9c0990ef0..2255f9fae73de 100644
--- a/pandas/tseries/tests/test_plotting.py
+++ b/pandas/tseries/tests/test_plotting.py
@@ -4,8 +4,6 @@
from pandas.compat import lrange, zip
import numpy as np

-from numpy.testing.decorators import slow
-
from pandas import Index, Series, DataFrame
from pandas.tseries.index import date_range, bdate_range
@@ -13,7 +11,7 @@
from pandas.tseries.period import period_range, Period, PeriodIndex
from pandas.tseries.resample import DatetimeIndex

-from pandas.util.testing import assert_series_equal, ensure_clean
+from pandas.util.testing import assert_series_equal, ensure_clean, slow
import pandas.util.testing as tm
from pandas.tests.test_graphics import _skip_if_no_scipy_gaussian_kde
@@ -76,6 +74,13 @@ def test_frame_inferred(self):
        df = DataFrame(np.random.randn(len(idx), 3), index=idx)
        _check_plot_works(df.plot)

+    def test_is_error_nozeroindex(self):
+        # GH11858
+        i = np.array([1, 2, 3])
+        a = DataFrame(i, index=i)
+        _check_plot_works(a.plot, xerr=a)
+        _check_plot_works(a.plot, yerr=a)
+
    def test_nonnumeric_exclude(self):
        import matplotlib.pyplot as plt
@@ -325,7 +330,7 @@ def test_dataframe(self):
        bts = DataFrame({'a': tm.makeTimeSeries()})
        ax = bts.plot()
        idx = ax.get_lines()[0].get_xdata()
-        tm.assert_numpy_array_equal(bts.index.to_period(), PeriodIndex(idx))
+        tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx))

    @slow
    def test_axis_limits(self):
@@ -1108,7 +1113,7 @@ def test_ax_plot(self):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        lines = ax.plot(x, y, label='Y')
-        tm.assert_numpy_array_equal(DatetimeIndex(lines[0].get_xdata()), x)
+        tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x)

    @slow
    def test_mpl_nopandas(self):
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 77396c3e38c93..2236d20975eee 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -13,18 +13,20 @@
                    notnull, Timestamp)
from pandas.compat import range, lrange, zip, product, OrderedDict
from pandas.core.base import SpecificationError
-from pandas.core.common import ABCSeries, ABCDataFrame
+from pandas.core.common import (ABCSeries, ABCDataFrame,
+                                UnsupportedFunctionCall)
from pandas.core.groupby import DataError
from pandas.tseries.frequencies import MONTHS, DAYS
+from pandas.tseries.frequencies import to_offset
from pandas.tseries.index import date_range
from pandas.tseries.offsets import Minute, BDay
from pandas.tseries.period import period_range, PeriodIndex, Period
from pandas.tseries.resample import (DatetimeIndex, TimeGrouper,
                                     DatetimeIndexResampler)
-from pandas.tseries.frequencies import to_offset
from pandas.tseries.tdi import timedelta_range
from pandas.util.testing import (assert_series_equal, assert_almost_equal,
-                                 assert_frame_equal)
+                                 assert_frame_equal, assert_index_equal)
+from pandas._period import IncompatibleFrequency

bday = BDay()
@@ -577,6 +579,7 @@ class Base(object):
    base class for resampling testing, calling .create_series()
    generates a series of each index type
    """
+
    def create_index(self, *args, **kwargs):
        """ return the _index_factory created using the args, kwargs """
        factory = self._index_factory()
@@ -619,6 +622,75 @@ def test_resample_interpolate(self):
            df.resample('1T').asfreq().interpolate(),
            df.resample('1T').interpolate())

+    def test_raises_on_non_datetimelike_index(self):
+        # this is a non datetimelike index
+        xp = DataFrame()
+        self.assertRaises(TypeError, lambda: xp.resample('A').mean())
+
+    def test_resample_empty_series(self):
+        # GH12771 & GH12868
+
+        s = self.create_series()[:0]
+
+        for freq in ['M', 'D', 'H']:
+            # need to test for ohlc from GH13083
+            methods = [method for method in resample_methods
+                       if method != 'ohlc']
+            for method in methods:
+                result = getattr(s.resample(freq), method)()
+
+                expected = s.copy()
+                expected.index = s.index._shallow_copy(freq=freq)
+                assert_index_equal(result.index, expected.index)
+                self.assertEqual(result.index.freq, expected.index.freq)
+
+                if (method == 'size' and
+                        isinstance(result.index, PeriodIndex) and
+                        freq in ['M', 'D']):
+                    # GH12871 - TODO: name should propagate, but currently
+                    # doesn't on lower / same frequency with PeriodIndex
+                    assert_series_equal(result, expected, check_dtype=False,
+                                        check_names=False)
+                    # this assert will break when fixed
+                    self.assertTrue(result.name is None)
+                else:
+                    assert_series_equal(result, expected, check_dtype=False)
+
+    def test_resample_empty_dataframe(self):
+        # GH13212
+        index = self.create_series().index[:0]
+        f = DataFrame(index=index)
+
+        for freq in ['M', 'D', 'H']:
+            # count retains dimensions too
+            methods = downsample_methods + ['count']
+            for method in methods:
+                result = getattr(f.resample(freq), method)()
+
+                expected = f.copy()
+                expected.index = f.index._shallow_copy(freq=freq)
+                assert_index_equal(result.index, expected.index)
+                self.assertEqual(result.index.freq, expected.index.freq)
+                assert_frame_equal(result, expected, check_dtype=False)
+
+        # test size for GH13212 (currently stays as df)
+
+    def test_resample_empty_dtypes(self):
+
+        # Empty series were sometimes causing a segfault (for the functions
+        # with Cython bounds-checking disabled) or an IndexError. We just run
+        # them to ensure they no longer do. (GH #10228)
+        for index in tm.all_timeseries_index_generator(0):
+            for dtype in (np.float, np.int, np.object, 'datetime64[ns]'):
+                for how in downsample_methods + upsample_methods:
+                    empty_series = pd.Series([], index, dtype)
+                    try:
+                        getattr(empty_series.resample('d'), how)()
+                    except DataError:
+                        # Ignore these since some combinations are invalid
+                        # (ex: doing mean with dtype of np.object)
+                        pass
+

class TestDatetimeIndex(Base, tm.TestCase):
    _multiprocess_can_split_ = True
@@ -746,6 +818,22 @@ def _ohlc(group):
                exc.args += ('how=%s' % arg,)
                raise

+    def test_numpy_compat(self):
+        # see gh-12811
+        s = Series([1, 2, 3, 4, 5], index=date_range(
+            '20130101', periods=5, freq='s'))
+        r = s.resample('2s')
+
+        msg = "numpy operations are not valid with resample"
+
+        for func in ('min', 'max', 'sum', 'prod',
+                     'mean', 'var', 'std'):
+            tm.assertRaisesRegexp(UnsupportedFunctionCall, msg,
+                                  getattr(r, func),
+                                  func, 1, 2, 3)
+            tm.assertRaisesRegexp(UnsupportedFunctionCall, msg,
+                                  getattr(r, func), axis=1)
+
    def test_resample_how_callables(self):
        # GH 7929
        data = np.arange(5, dtype=np.int64)
@@ -1330,7 +1418,7 @@ def test_resample_base(self):
        resampled = ts.resample('5min', base=2).mean()
        exp_rng = date_range('12/31/1999 23:57:00', '1/1/2000 01:57',
                             freq='5min')
-        self.assertTrue(resampled.index.equals(exp_rng))
+        self.assert_index_equal(resampled.index, exp_rng)

    def test_resample_base_with_timedeltaindex(self):
@@ -1344,8 +1432,8 @@ def test_resample_base_with_timedeltaindex(self):
        exp_without_base = timedelta_range(start='0s', end='25s', freq='2s')
        exp_with_base = timedelta_range(start='5s', end='29s', freq='2s')

-        self.assertTrue(without_base.index.equals(exp_without_base))
-        self.assertTrue(with_base.index.equals(exp_with_base))
+        self.assert_index_equal(without_base.index, exp_without_base)
+        self.assert_index_equal(with_base.index, exp_with_base)

    def test_resample_categorical_data_with_timedeltaindex(self):
        # GH #12169
@@ -1376,7 +1464,7 @@ def test_resample_to_period_monthly_buglet(self):
        result = ts.resample('M', kind='period').mean()

        exp_index = period_range('Jan-2000', 'Dec-2000', freq='M')
-        self.assertTrue(result.index.equals(exp_index))
+        self.assert_index_equal(result.index, exp_index)

    def test_period_with_agg(self):
@@ -1391,39 +1479,6 @@ def test_period_with_agg(self):
        result = s2.resample('D').agg(lambda x: x.mean())
        assert_series_equal(result, expected)

-    def test_resample_empty(self):
-        ts = _simple_ts('1/1/2000', '2/1/2000')[:0]
-
-        result = ts.resample('A').mean()
-        self.assertEqual(len(result), 0)
-        self.assertEqual(result.index.freqstr, 'A-DEC')
-
-        result = ts.resample('A', kind='period').mean()
-        self.assertEqual(len(result), 0)
-        self.assertEqual(result.index.freqstr, 'A-DEC')
-
-        # this is a non datetimelike index
-        xp = DataFrame()
-        self.assertRaises(TypeError, lambda: xp.resample('A').mean())
-
-        # Empty series were sometimes causing a segfault (for the functions
-        # with Cython bounds-checking disabled) or an IndexError. We just run
-        # them to ensure they no longer do. (GH #10228)
-        for index in tm.all_timeseries_index_generator(0):
-            for dtype in (np.float, np.int, np.object, 'datetime64[ns]'):
-                for how in downsample_methods + upsample_methods:
-                    empty_series = pd.Series([], index, dtype)
-                    try:
-                        getattr(empty_series.resample('d'), how)()
-                    except DataError:
-                        # Ignore these since some combinations are invalid
-                        # (ex: doing mean with dtype of np.object)
-                        pass
-
-        # this should also tests nunique
-        # (IOW, use resample_methods)
-        # when GH12886 is closed

    def test_resample_segfault(self):
        # GH 8573
        # segfaulting in older versions
@@ -1572,7 +1627,7 @@ def test_corner_cases(self):
        result = ts.resample('5t', closed='right', label='left').mean()

        ex_index = date_range('1999-12-31 23:55', periods=4, freq='5t')
-        self.assertTrue(result.index.equals(ex_index))
+        self.assert_index_equal(result.index, ex_index)

        len0pts = _simple_pts('2007-01', '2010-05', freq='M')[:0]
        # it works
@@ -1846,6 +1901,32 @@ def test_resmaple_dst_anchor(self):
                                             freq='D', tz='Europe/Paris')),
                            'D Frequency')

+    def test_resample_with_nat(self):
+        # GH 13020
+        index = DatetimeIndex([pd.NaT,
+                               '1970-01-01 00:00:00',
+                               pd.NaT,
+                               '1970-01-01 00:00:01',
+                               '1970-01-01 00:00:02'])
+        frame = DataFrame([2, 3, 5, 7, 11], index=index)
+
+        index_1s = DatetimeIndex(['1970-01-01 00:00:00',
+                                  '1970-01-01 00:00:01',
+                                  '1970-01-01 00:00:02'])
+        frame_1s = DataFrame([3, 7, 11], index=index_1s)
+        assert_frame_equal(frame.resample('1s').mean(), frame_1s)
+
+        index_2s = DatetimeIndex(['1970-01-01 00:00:00',
+                                  '1970-01-01 00:00:02'])
+        frame_2s = DataFrame([5, 11], index=index_2s)
+        assert_frame_equal(frame.resample('2s').mean(), frame_2s)
+
+        index_3s = DatetimeIndex(['1970-01-01 00:00:00'])
+        frame_3s = DataFrame([7], index=index_3s)
+        assert_frame_equal(frame.resample('3s').mean(), frame_3s)
+
+        assert_frame_equal(frame.resample('60s').mean(), frame_3s)
+

class TestPeriodIndex(Base, tm.TestCase):
    _multiprocess_can_split_ = True
@@ -2042,19 +2123,6 @@ def test_resample_basic(self):
        result2 = s.resample('T', kind='period').mean()
        assert_series_equal(result2, expected)

-    def test_resample_empty(self):
-
-        # GH12771 & GH12868
-        index = PeriodIndex(start='2000', periods=0, freq='D', name='idx')
-        s = Series(index=index)
-
-        expected_index = PeriodIndex([], name='idx', freq='M')
-        expected = Series(index=expected_index)
-
-        for method in resample_methods:
-            result = getattr(s.resample('M'), method)()
-            assert_series_equal(result, expected)
-
    def test_resample_count(self):

        # GH12774
@@ -2078,6 +2146,12 @@ def test_resample_same_freq(self):
            result = getattr(series.resample('M'), method)()
            assert_series_equal(result, expected)

+    def test_resample_incompat_freq(self):
+
+        with self.assertRaises(IncompatibleFrequency):
+            pd.Series(range(3), index=pd.period_range(
+                start='2000', periods=3, freq='M')).resample('W').mean()
+
    def test_with_local_timezone_pytz(self):
        # GH5430
        tm._skip_if_no_pytz()
@@ -2317,7 +2391,7 @@ def test_closed_left_corner(self):

        ex_index = date_range(start='1/1/2012 9:30', freq='10min', periods=3)
-        self.assertTrue(result.index.equals(ex_index))
+        self.assert_index_equal(result.index, ex_index)
        assert_series_equal(result, exp)

    def test_quarterly_resampling(self):
@@ -2439,7 +2513,6 @@ def create_series(self):
        return Series(np.arange(len(i)), index=i, name='tdi')

    def test_asfreq_bug(self):
-        import datetime as dt
        df = DataFrame(data=[1, 3],
                       index=[dt.timedelta(), dt.timedelta(minutes=3)])
@@ -2452,7 +2525,6 @@ def test_asfreq_bug(self):

class TestResamplerGrouper(tm.TestCase):
-
    def setUp(self):
        self.frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8,
                                'B': np.arange(40)},
@@ -2519,6 +2591,25 @@ def test_getitem(self):
        result = g.resample('2s').mean().B
        assert_series_equal(result, expected)

+    def test_getitem_multiple(self):
+
+        # GH 13174
+        # multiple calls after selection causing an issue with aliasing
+        data = [{'id': 1, 'buyer': 'A'}, {'id': 2, 'buyer': 'B'}]
+        df = pd.DataFrame(data, index=pd.date_range('2016-01-01', periods=2))
+        r = df.groupby('id').resample('1D')
+        result = r['buyer'].count()
+        expected = pd.Series([1, 1],
+                             index=pd.MultiIndex.from_tuples(
+                                 [(1, pd.Timestamp('2016-01-01')),
+                                  (2, pd.Timestamp('2016-01-02'))],
+                                 names=['id', None]),
+                             name='buyer')
+        assert_series_equal(result, expected)
+
+        result = r['buyer'].count()
+        assert_series_equal(result, expected)
+
    def test_methods(self):
        g = self.frame.groupby('A')
        r = g.resample('2s')
@@ -2569,14 +2660,36 @@ def test_apply(self):

        def f(x):
            return x.resample('2s').sum()
+
        result = r.apply(f)
        assert_frame_equal(result, expected)

        def f(x):
            return x.resample('2s').apply(lambda y: y.sum())
+
        result = g.apply(f)
        assert_frame_equal(result, expected)

+    def test_resample_groupby_with_label(self):
+        # GH 13235
+        index = date_range('2000-01-01', freq='2D', periods=5)
+        df = DataFrame(index=index,
+                       data={'col0': [0, 0, 1, 1, 2], 'col1': [1, 1, 1, 1, 1]}
+                       )
+        result = df.groupby('col0').resample('1W', label='left').sum()
+
+        mi = [np.array([0, 0, 1, 2]),
+              pd.to_datetime(np.array(['1999-12-26', '2000-01-02',
+                                       '2000-01-02', '2000-01-02'])
+                             )
+              ]
+        mindex = pd.MultiIndex.from_arrays(mi, names=['col0', None])
+        expected = DataFrame(data={'col0': [0, 0, 2, 2], 'col1': [1, 1, 2, 1]},
+                             index=mindex
+                             )
+
+        assert_frame_equal(result, expected)
+
    def test_consistency_with_window(self):

        # consistent return values with window
@@ -2647,7 +2760,7 @@ def test_apply_iteration(self):

        # it works!
        result = grouped.apply(f)
-        self.assertTrue(result.index.equals(df.index))
+        self.assert_index_equal(result.index, df.index)

    def test_panel_aggregation(self):
        ind = pd.date_range('1/1/2000', periods=100)
diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
index c764f34b697c1..10276137b42a1 100644
--- a/pandas/tseries/tests/test_timedeltas.py
+++ b/pandas/tseries/tests/test_timedeltas.py
@@ -16,7 +16,6 @@
from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type as ct
from pandas.util.testing import (assert_series_equal, assert_frame_equal,
                                 assert_almost_equal, assert_index_equal)
-from numpy.testing import assert_allclose
from pandas.tseries.offsets import Day, Second
import pandas.util.testing as tm
from numpy.random import randn
@@ -413,6 +412,38 @@ def test_ops_series(self):
            tm.assert_series_equal(expected, td * other)
            tm.assert_series_equal(expected, other * td)

+    def test_ops_series_object(self):
+        # GH 13043
+        s = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'),
+                       pd.Timestamp('2015-01-01', tz='Asia/Tokyo')],
+                      name='xxx')
+        self.assertEqual(s.dtype, object)
+
+        exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'),
+                         pd.Timestamp('2015-01-02', tz='Asia/Tokyo')],
+                        name='xxx')
+        tm.assert_series_equal(s + pd.Timedelta('1 days'), exp)
+        tm.assert_series_equal(pd.Timedelta('1 days') + s, exp)
+
+        # object series & object series
+        s2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'),
+                        pd.Timestamp('2015-01-05', tz='Asia/Tokyo')],
+                       name='xxx')
+        self.assertEqual(s2.dtype, object)
+        exp = pd.Series([pd.Timedelta('2 days'), pd.Timedelta('4 days')],
+                        name='xxx')
+        tm.assert_series_equal(s2 - s, exp)
+        tm.assert_series_equal(s - s2, -exp)
+
+        s = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')],
+                      name='xxx', dtype=object)
+        self.assertEqual(s.dtype, object)
+
+        exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')],
+                        name='xxx')
+        tm.assert_series_equal(s + pd.Timedelta('00:30:00'), exp)
+        tm.assert_series_equal(pd.Timedelta('00:30:00') + s, exp)
+
    def test_compare_timedelta_series(self):
        # regresssion test for GH5963
        s = pd.Series([timedelta(days=1), timedelta(days=2)])
@@ -1159,12 +1190,6 @@ def test_append_numpy_bug_1681(self):
        result = a.append(c)
        self.assertTrue((result['B'] == td).all())

-    def test_astype(self):
-        rng = timedelta_range('1 days', periods=10)
-
-        result = rng.astype('i8')
-        self.assert_numpy_array_equal(result, rng.asi8)
-
    def test_fields(self):
        rng = timedelta_range('1 days, 10:11:12.100123456', periods=2,
                              freq='s')
@@ -1198,7 +1223,7 @@ def test_total_seconds(self):
                              freq='s')
        expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9,
                1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456. / 1e9]
-        assert_allclose(rng.total_seconds(), expt, atol=1e-10, rtol=0)
+        tm.assert_almost_equal(rng.total_seconds(), np.array(expt))

        # test Series
        s = Series(rng)
@@ -1213,14 +1238,14 @@ def test_total_seconds(self):

        # with both nat
        s = Series([np.nan, np.nan], dtype='timedelta64[ns]')
-        tm.assert_series_equal(s.dt.total_seconds(), Series(
-            [np.nan, np.nan], index=[0, 1]))
+        tm.assert_series_equal(s.dt.total_seconds(),
+                               Series([np.nan, np.nan], index=[0, 1]))

    def test_total_seconds_scalar(self):
        # GH 10939
        rng = Timedelta('1 days, 10:11:12.100123456')
        expt = 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9
-        assert_allclose(rng.total_seconds(), expt, atol=1e-10, rtol=0)
+        tm.assert_almost_equal(rng.total_seconds(), expt)

        rng = Timedelta(np.nan)
        self.assertTrue(np.isnan(rng.total_seconds()))
@@ -1263,7 +1288,7 @@ def test_constructor(self):
    def test_constructor_coverage(self):
        rng = timedelta_range('1 days', periods=10.5)
        exp = timedelta_range('1 days', periods=10)
-        self.assertTrue(rng.equals(exp))
+        self.assert_index_equal(rng, exp)

        self.assertRaises(ValueError, TimedeltaIndex, start='1 days',
                          periods='foo', freq='D')
@@ -1277,16 +1302,16 @@ def test_constructor_coverage(self):
        gen = (timedelta(i) for i in range(10))
        result = TimedeltaIndex(gen)
        expected = TimedeltaIndex([timedelta(i) for i in range(10)])
-        self.assertTrue(result.equals(expected))
+        self.assert_index_equal(result, expected)

        # NumPy string array
        strings = np.array(['1 days', '2 days', '3 days'])
        result = TimedeltaIndex(strings)
        expected = to_timedelta([1, 2, 3], unit='d')
-        self.assertTrue(result.equals(expected))
+        self.assert_index_equal(result, expected)

        from_ints = TimedeltaIndex(expected.asi8)
-        self.assertTrue(from_ints.equals(expected))
+        self.assert_index_equal(from_ints, expected)

        # non-conforming freq
        self.assertRaises(ValueError, TimedeltaIndex,
@@ -1413,7 +1438,7 @@ def test_map(self):

        f = lambda x: x.days
        result = rng.map(f)
-        exp = [f(x) for x in rng]
+        exp = np.array([f(x) for x in rng], dtype=np.int64)
        self.assert_numpy_array_equal(result, exp)

    def test_misc_coverage(self):
@@ -1434,7 +1459,7 @@ def test_union(self):
        i2 = timedelta_range('3day', periods=5)
        result = i1.union(i2)
        expected = timedelta_range('1day', periods=7)
-        self.assert_numpy_array_equal(result, expected)
+        self.assert_index_equal(result, expected)

        i1 = Int64Index(np.arange(0, 20, 2))
        i2 = TimedeltaIndex(start='1 day', periods=10, freq='D')
@@ -1446,10 +1471,10 @@ def test_union_coverage(self):
        idx = TimedeltaIndex(['3d', '1d', '2d'])
        ordered = TimedeltaIndex(idx.sort_values(), freq='infer')
        result = ordered.union(idx)
-        self.assertTrue(result.equals(ordered))
+        self.assert_index_equal(result, ordered)

        result = ordered[:0].union(ordered)
-        self.assertTrue(result.equals(ordered))
+        self.assert_index_equal(result, ordered)
        self.assertEqual(result.freq, ordered.freq)

    def test_union_bug_1730(self):
@@ -1459,18 +1484,18 @@ def test_union_bug_1730(self):

        result = rng_a.union(rng_b)
        exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b))))
-        self.assertTrue(result.equals(exp))
+        self.assert_index_equal(result, exp)

    def test_union_bug_1745(self):

        left = TimedeltaIndex(['1 day 15:19:49.695000'])
-        right = TimedeltaIndex(
-            ['2 day 13:04:21.322000', '1 day 15:27:24.873000',
-             '1 day 15:31:05.350000'])
+        right = TimedeltaIndex(['2 day 13:04:21.322000',
+                                '1 day 15:27:24.873000',
+                                '1 day 15:31:05.350000'])

        result = left.union(right)
        exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right))))
-        self.assertTrue(result.equals(exp))
+        self.assert_index_equal(result, exp)

    def test_union_bug_4564(self):
@@ -1479,7 +1504,7 @@ def test_union_bug_4564(self):

        result = left.union(right)
        exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right))))
-        self.assertTrue(result.equals(exp))
+        self.assert_index_equal(result, exp)

    def test_intersection_bug_1708(self):
        index_1 = timedelta_range('1 day', periods=4, freq='h')
@@ -1501,7 +1526,7 @@ def test_get_duplicates(self):

        result = idx.get_duplicates()
        ex = TimedeltaIndex(['2 day', '3day'])
-        self.assertTrue(result.equals(ex))
+        self.assert_index_equal(result, ex)

    def test_argmin_argmax(self):
        idx = TimedeltaIndex(['1 day 00:00:05', '1 day 00:00:01',
= TimedeltaIndex(['1 day 00:00:05', '1 day 00:00:01', @@ -1521,11 +1546,13 @@ def test_sort_values(self): ordered, dexer = idx.sort_values(return_indexer=True) self.assertTrue(ordered.is_monotonic) - self.assert_numpy_array_equal(dexer, [1, 2, 0]) + self.assert_numpy_array_equal(dexer, + np.array([1, 2, 0], dtype=np.int64)) ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) self.assertTrue(ordered[::-1].is_monotonic) - self.assert_numpy_array_equal(dexer, [0, 2, 1]) + self.assert_numpy_array_equal(dexer, + np.array([0, 2, 1], dtype=np.int64)) def test_insert(self): @@ -1533,7 +1560,7 @@ def test_insert(self): result = idx.insert(2, timedelta(days=5)) exp = TimedeltaIndex(['4day', '1day', '5day', '2day'], name='idx') - self.assertTrue(result.equals(exp)) + self.assert_index_equal(result, exp) # insertion of non-datetime should coerce to object index result = idx.insert(1, 'inserted') @@ -1569,7 +1596,7 @@ def test_insert(self): for n, d, expected in cases: result = idx.insert(n, d) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) @@ -1593,7 +1620,7 @@ def test_delete(self): 1: expected_1} for n, expected in compat.iteritems(cases): result = idx.delete(n) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) @@ -1620,12 +1647,12 @@ def test_delete_slice(self): (3, 4, 5): expected_3_5} for n, expected in compat.iteritems(cases): result = idx.delete(n) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) result = idx.delete(slice(n[0], n[-1] + 1)) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) @@ -1639,7 +1666,7 @@ def test_take(self): taken2 = idx[[2, 4, 10]] for taken in [taken1, taken2]: - self.assertTrue(taken.equals(expected)) + self.assert_index_equal(taken, expected) tm.assertIsInstance(taken, TimedeltaIndex) self.assertIsNone(taken.freq) self.assertEqual(taken.name, expected.name) @@ -1686,7 +1713,7 @@ def test_isin(self): self.assertTrue(result.all()) assert_almost_equal(index.isin([index[2], 5]), - [False, False, True, False]) + np.array([False, False, True, False])) def test_does_not_convert_mixed_integer(self): df = tm.makeCustomDataframe(10, 10, @@ -1723,18 +1750,18 @@ def test_factorize(self): arr, idx = idx1.factorize() self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(exp_idx)) + self.assert_index_equal(idx, exp_idx) arr, idx = idx1.factorize(sort=True) self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(exp_idx)) + self.assert_index_equal(idx, exp_idx) # freq must be preserved idx3 = timedelta_range('1 day', periods=4, freq='s') exp_arr = np.array([0, 1, 2, 3]) arr, idx = idx3.factorize() self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(idx3)) + self.assert_index_equal(idx, idx3) class TestSlicing(tm.TestCase): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 3d8e389ba30f2..f6d80f7ee410b 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -5,7 +5,6 @@ import warnings from datetime import 
datetime, time, timedelta from numpy.random import rand -from numpy.testing.decorators import slow import nose import numpy as np @@ -31,7 +30,7 @@ from pandas.tslib import iNaT from pandas.util.testing import ( assert_frame_equal, assert_series_equal, assert_almost_equal, - _skip_if_has_locale) + _skip_if_has_locale, slow) randn = np.random.randn @@ -60,7 +59,7 @@ def test_index_unique(self): expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 4), datetime(2000, 1, 5)]) self.assertEqual(uniques.dtype, 'M8[ns]') # sanity - self.assertTrue(uniques.equals(expected)) + tm.assert_index_equal(uniques, expected) self.assertEqual(self.dups.index.nunique(), 4) # #2563 @@ -69,22 +68,23 @@ def test_index_unique(self): dups_local = self.dups.index.tz_localize('US/Eastern') dups_local.name = 'foo' result = dups_local.unique() - expected = DatetimeIndex(expected).tz_localize('US/Eastern') + expected = DatetimeIndex(expected, name='foo') + expected = expected.tz_localize('US/Eastern') self.assertTrue(result.tz is not None) self.assertEqual(result.name, 'foo') - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) # NaT, note this is excluded arr = [1370745748 + t for t in range(20)] + [iNaT] idx = DatetimeIndex(arr * 3) - self.assertTrue(idx.unique().equals(DatetimeIndex(arr))) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) self.assertEqual(idx.nunique(), 20) self.assertEqual(idx.nunique(dropna=False), 21) arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20)] + [NaT] idx = DatetimeIndex(arr * 3) - self.assertTrue(idx.unique().equals(DatetimeIndex(arr))) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) self.assertEqual(idx.nunique(), 20) self.assertEqual(idx.nunique(dropna=False), 21) @@ -285,12 +285,12 @@ def test_recreate_from_data(self): for f in freqs: org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) idx = DatetimeIndex(org, freq=f) - self.assertTrue(idx.equals(org)) + tm.assert_index_equal(idx, org) org = DatetimeIndex(start='2001/02/01 09:00', freq=f, tz='US/Pacific', periods=1) idx = DatetimeIndex(org, freq=f, tz='US/Pacific') - self.assertTrue(idx.equals(org)) + tm.assert_index_equal(idx, org) def assert_range_equal(left, right): @@ -762,6 +762,15 @@ def test_to_datetime_unit(self): with self.assertRaises(ValueError): to_datetime([1, 2, 111111111], unit='D') + # coerce we can process + expected = DatetimeIndex([Timestamp('1970-01-02'), + Timestamp('1970-01-03')] + ['NaT'] * 1) + result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce') + tm.assert_index_equal(result, expected) + + result = to_datetime([1, 2, 111111111], unit='D', errors='coerce') + tm.assert_index_equal(result, expected) + def test_series_ctor_datetime64(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') dates = np.asarray(rng) @@ -866,7 +875,7 @@ def test_string_na_nat_conversion(self): result2 = to_datetime(strings) tm.assertIsInstance(result2, DatetimeIndex) - tm.assert_numpy_array_equal(result, result2) + tm.assert_numpy_array_equal(result, result2.values) malformed = np.array(['1/100/2000', np.nan], dtype=object) @@ -1057,7 +1066,7 @@ def test_to_datetime_list_of_integers(self): result = DatetimeIndex(ints) - self.assertTrue(rng.equals(result)) + tm.assert_index_equal(rng, result) def test_to_datetime_freq(self): xp = bdate_range('2000-1-1', periods=10, tz='UTC') @@ -1101,8 +1110,8 @@ def test_asfreq_keep_index_name(self): index = pd.date_range('20130101', 
periods=20, name=index_name) df = pd.DataFrame([x for x in range(20)], columns=['foo'], index=index) - tm.assert_equal(index_name, df.index.name) - tm.assert_equal(index_name, df.asfreq('10D').index.name) + self.assertEqual(index_name, df.index.name) + self.assertEqual(index_name, df.asfreq('10D').index.name) def test_promote_datetime_date(self): rng = date_range('1/1/2000', periods=20) @@ -1154,15 +1163,15 @@ def test_date_range_gen_error(self): def test_date_range_negative_freq(self): # GH 11018 rng = date_range('2011-12-31', freq='-2A', periods=3) - exp = pd.DatetimeIndex( - ['2011-12-31', '2009-12-31', '2007-12-31'], freq='-2A') - self.assert_index_equal(rng, exp) + exp = pd.DatetimeIndex(['2011-12-31', '2009-12-31', + '2007-12-31'], freq='-2A') + tm.assert_index_equal(rng, exp) self.assertEqual(rng.freq, '-2A') rng = date_range('2011-01-31', freq='-2M', periods=3) - exp = pd.DatetimeIndex( - ['2011-01-31', '2010-11-30', '2010-09-30'], freq='-2M') - self.assert_index_equal(rng, exp) + exp = pd.DatetimeIndex(['2011-01-31', '2010-11-30', + '2010-09-30'], freq='-2M') + tm.assert_index_equal(rng, exp) self.assertEqual(rng.freq, '-2M') def test_date_range_bms_bug(self): @@ -1515,7 +1524,7 @@ def test_normalize(self): result = rng.normalize() expected = date_range('1/1/2000', periods=10, freq='D') - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) rng_ns = pd.DatetimeIndex(np.array([1380585623454345752, 1380585612343234312]).astype( @@ -1524,7 +1533,7 @@ def test_normalize(self): expected = pd.DatetimeIndex(np.array([1380585600000000000, 1380585600000000000]).astype( "datetime64[ns]")) - self.assertTrue(rng_ns_normalized.equals(expected)) + tm.assert_index_equal(rng_ns_normalized, expected) self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) @@ -1541,7 +1550,7 @@ def test_to_period(self): pts = ts.to_period('M') exp.index = exp.index.asfreq('M') - self.assertTrue(pts.index.equals(exp.index.asfreq('M'))) + tm.assert_index_equal(pts.index, exp.index.asfreq('M')) assert_series_equal(pts, exp) # GH 7606 without freq @@ -1599,7 +1608,7 @@ def test_to_period_tz_pytz(self): expected = ts[0].to_period() self.assertEqual(result, expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=UTC) @@ -1607,7 +1616,7 @@ def test_to_period_tz_pytz(self): expected = ts[0].to_period() self.assertEqual(result, expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) @@ -1615,7 +1624,7 @@ def test_to_period_tz_pytz(self): expected = ts[0].to_period() self.assertEqual(result, expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) def test_to_period_tz_explicit_pytz(self): tm._skip_if_no_pytz() @@ -1630,7 +1639,7 @@ def test_to_period_tz_explicit_pytz(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=pytz.utc) @@ -1638,7 +1647,7 @@ def test_to_period_tz_explicit_pytz(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) @@ -1646,7 +1655,7 @@ def test_to_period_tz_explicit_pytz(self): expected = ts[0].to_period() 
self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) def test_to_period_tz_dateutil(self): tm._skip_if_no_dateutil() @@ -1661,7 +1670,7 @@ def test_to_period_tz_dateutil(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=dateutil.tz.tzutc()) @@ -1669,7 +1678,7 @@ def test_to_period_tz_dateutil(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) @@ -1677,7 +1686,7 @@ def test_to_period_tz_dateutil(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) def test_frame_to_period(self): K = 5 @@ -1694,7 +1703,7 @@ def test_frame_to_period(self): assert_frame_equal(pts, exp) pts = df.to_period('M') - self.assertTrue(pts.index.equals(exp.index.asfreq('M'))) + tm.assert_index_equal(pts.index, exp.index.asfreq('M')) df = df.T pts = df.to_period(axis=1) @@ -1703,7 +1712,7 @@ def test_frame_to_period(self): assert_frame_equal(pts, exp) pts = df.to_period('M', axis=1) - self.assertTrue(pts.columns.equals(exp.columns.asfreq('M'))) + tm.assert_index_equal(pts.columns, exp.columns.asfreq('M')) self.assertRaises(ValueError, df.to_period, axis=2) @@ -1791,11 +1800,11 @@ def test_datetimeindex_integers_shift(self): result = rng + 5 expected = rng.shift(5) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) result = rng - 5 expected = rng.shift(-5) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) def test_astype_object(self): # NumPy 1.6.1 weak ns support @@ -1804,7 +1813,8 @@ def test_astype_object(self): casted = rng.astype('O') exp_values = list(rng) - self.assert_numpy_array_equal(casted, exp_values) + tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) + self.assertEqual(casted.tolist(), exp_values) def test_catch_infinite_loop(self): offset = datetools.DateOffset(minute=5) @@ -1820,15 +1830,15 @@ def test_append_concat(self): result = ts.append(ts) result_df = df.append(df) ex_index = DatetimeIndex(np.tile(rng.values, 2)) - self.assertTrue(result.index.equals(ex_index)) - self.assertTrue(result_df.index.equals(ex_index)) + tm.assert_index_equal(result.index, ex_index) + tm.assert_index_equal(result_df.index, ex_index) appended = rng.append(rng) - self.assertTrue(appended.equals(ex_index)) + tm.assert_index_equal(appended, ex_index) appended = rng.append([rng, rng]) ex_index = DatetimeIndex(np.tile(rng.values, 3)) - self.assertTrue(appended.equals(ex_index)) + tm.assert_index_equal(appended, ex_index) # different index names rng1 = rng.copy() @@ -1855,11 +1865,11 @@ def test_append_concat_tz(self): result = ts.append(ts2) result_df = df.append(df2) - self.assertTrue(result.index.equals(rng3)) - self.assertTrue(result_df.index.equals(rng3)) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) - self.assertTrue(appended.equals(rng3)) + tm.assert_index_equal(appended, rng3) def test_append_concat_tz_explicit_pytz(self): # GH 2938 @@ -1879,11 +1889,11 @@ def test_append_concat_tz_explicit_pytz(self): result = ts.append(ts2) result_df = df.append(df2) - 
self.assertTrue(result.index.equals(rng3))
-        self.assertTrue(result_df.index.equals(rng3))
+        tm.assert_index_equal(result.index, rng3)
+        tm.assert_index_equal(result_df.index, rng3)

         appended = rng.append(rng2)
-        self.assertTrue(appended.equals(rng3))
+        tm.assert_index_equal(appended, rng3)

     def test_append_concat_tz_dateutil(self):
         # GH 2938
@@ -1901,11 +1911,11 @@ def test_append_concat_tz_dateutil(self):

         result = ts.append(ts2)
         result_df = df.append(df2)
-        self.assertTrue(result.index.equals(rng3))
-        self.assertTrue(result_df.index.equals(rng3))
+        tm.assert_index_equal(result.index, rng3)
+        tm.assert_index_equal(result_df.index, rng3)

         appended = rng.append(rng2)
-        self.assertTrue(appended.equals(rng3))
+        tm.assert_index_equal(appended, rng3)

     def test_set_dataframe_column_ns_dtype(self):
         x = DataFrame([datetime.now(), datetime.now()])
@@ -2283,18 +2293,162 @@ def test_to_datetime_tz_psycopg2(self):
                                  dtype='datetime64[ns, UTC]')
         tm.assert_index_equal(result, expected)

+    def test_datetime_bool(self):
+        # GH13176
+        with self.assertRaises(TypeError):
+            to_datetime(False)
+        self.assertTrue(to_datetime(False, errors="coerce") is tslib.NaT)
+        self.assertEqual(to_datetime(False, errors="ignore"), False)
+        with self.assertRaises(TypeError):
+            to_datetime(True)
+        self.assertTrue(to_datetime(True, errors="coerce") is tslib.NaT)
+        self.assertEqual(to_datetime(True, errors="ignore"), True)
+        with self.assertRaises(TypeError):
+            to_datetime([False, datetime.today()])
+        with self.assertRaises(TypeError):
+            to_datetime(['20130101', True])
+        tm.assert_index_equal(to_datetime([0, False, tslib.NaT, 0.0],
+                                          errors="coerce"),
+                              DatetimeIndex([to_datetime(0), tslib.NaT,
+                                             tslib.NaT, to_datetime(0)]))
+
+    def test_datetime_invalid_datatype(self):
+        # GH13176
+
+        with self.assertRaises(TypeError):
+            pd.to_datetime(bool)
+        with self.assertRaises(TypeError):
+            pd.to_datetime(pd.to_datetime)
+
+    def test_unit(self):
+        # GH 11758
+        # test proper behavior with errors
+
+        with self.assertRaises(ValueError):
+            to_datetime([1], unit='D', format='%Y%m%d')
+
+        values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
+                  'NaT', '']
+        result = to_datetime(values, unit='D', errors='ignore')
+        expected = Index([11111111, Timestamp('1970-01-02'),
+                          Timestamp('1970-01-02'), pd.NaT,
+                          pd.NaT, pd.NaT, pd.NaT, pd.NaT],
+                         dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime(values, unit='D', errors='coerce')
+        expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
+                                  'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
+        tm.assert_index_equal(result, expected)
+
+        with self.assertRaises(tslib.OutOfBoundsDatetime):
+            to_datetime(values, unit='D', errors='raise')
+
+        values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT']
+
+        result = to_datetime(values, errors='ignore', unit='s')
+        expected = Index([1420043460000, pd.NaT, pd.NaT,
+                          pd.NaT, pd.NaT], dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime(values, errors='coerce', unit='s')
+        expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
+        tm.assert_index_equal(result, expected)
+
+        with self.assertRaises(tslib.OutOfBoundsDatetime):
+            to_datetime(values, errors='raise', unit='s')
+
+        # if we have a string, then we raise a ValueError
+        # and NOT an OutOfBoundsDatetime
+        for val in ['foo', Timestamp('20130101')]:
+            try:
+                to_datetime(val, errors='raise', unit='s')
+            except tslib.OutOfBoundsDatetime:
+                raise AssertionError("incorrect exception raised")
+            except ValueError:
+                pass
+
+    def test_unit_consistency(self):
+
+        # consistency
of conversions + expected = Timestamp('1970-05-09 14:25:11') + result = pd.to_datetime(11111111, unit='s', errors='raise') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='coerce') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='ignore') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + def test_unit_with_numeric(self): + + # GH 13180 + # coercions from floats/ints are ok + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr1 = [1.434692e+18, 1.432766e+18] + arr2 = np.array(arr1).astype('int64') + for errors in ['ignore', 'raise', 'coerce']: + result = pd.to_datetime(arr1, errors=errors) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(arr2, errors=errors) + tm.assert_index_equal(result, expected) + + # but we want to make sure that we are coercing + # if we have ints/strings + expected = DatetimeIndex(['NaT', + '2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr = ['foo', 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20', + 'NaT', + 'NaT']) + arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + def test_unit_mixed(self): + + # mixed integers/datetimes + expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) + arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + + expected = DatetimeIndex(['NaT', + 'NaT', + '2013-01-01']) + arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + def test_index_to_datetime(self): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) result = idx.to_datetime() expected = DatetimeIndex(datetools.to_datetime(idx.values)) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) today = datetime.today() idx = Index([today], dtype=object) result = idx.to_datetime() expected = DatetimeIndex([today]) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) def test_dataframe(self): @@ -2437,34 +2591,6 @@ def test_append_join_nondatetimeindex(self): # it works rng.join(idx, how='outer') - def test_astype(self): - rng = date_range('1/1/2000', periods=10) - - result = rng.astype('i8') - self.assert_numpy_array_equal(result, rng.asi8) - - # with tz - rng = date_range('1/1/2000', periods=10, tz='US/Eastern') - result = rng.astype('datetime64[ns]') - expected = (date_range('1/1/2000', periods=10, - tz='US/Eastern') - .tz_convert('UTC').tz_localize(None)) - tm.assert_index_equal(result, expected) - - # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex - result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str) - expected = pd.Series( - ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object) - tm.assert_series_equal(result, expected) - - result = Series(pd.date_range('2012-01-01', periods=3, - tz='US/Eastern')).astype(str) - expected = Series(['2012-01-01 
00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - dtype=object) - tm.assert_series_equal(result, expected) - def test_to_period_nofreq(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) self.assertRaises(ValueError, idx.to_period) @@ -2472,14 +2598,14 @@ def test_to_period_nofreq(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], freq='infer') self.assertEqual(idx.freqstr, 'D') - expected = pd.PeriodIndex( - ['2000-01-01', '2000-01-02', '2000-01-03'], freq='D') - self.assertTrue(idx.to_period().equals(expected)) + expected = pd.PeriodIndex(['2000-01-01', '2000-01-02', + '2000-01-03'], freq='D') + tm.assert_index_equal(idx.to_period(), expected) # GH 7606 idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) self.assertEqual(idx.freqstr, None) - self.assertTrue(idx.to_period().equals(expected)) + tm.assert_index_equal(idx.to_period(), expected) def test_000constructor_resolution(self): # 2252 @@ -2491,7 +2617,7 @@ def test_000constructor_resolution(self): def test_constructor_coverage(self): rng = date_range('1/1/2000', periods=10.5) exp = date_range('1/1/2000', periods=10) - self.assertTrue(rng.equals(exp)) + tm.assert_index_equal(rng, exp) self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', periods='foo', freq='D') @@ -2506,25 +2632,25 @@ def test_constructor_coverage(self): result = DatetimeIndex(gen) expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i) for i in range(10)]) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) # NumPy string array strings = np.array(['2000-01-01', '2000-01-02', '2000-01-03']) result = DatetimeIndex(strings) expected = DatetimeIndex(strings.astype('O')) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) - self.assertTrue(from_ints.equals(expected)) + tm.assert_index_equal(from_ints, expected) # string with NaT strings = np.array(['2000-01-01', '2000-01-02', 'NaT']) result = DatetimeIndex(strings) expected = DatetimeIndex(strings.astype('O')) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) - self.assertTrue(from_ints.equals(expected)) + tm.assert_index_equal(from_ints, expected) # non-conforming self.assertRaises(ValueError, DatetimeIndex, @@ -2591,17 +2717,15 @@ def test_constructor_datetime64_tzformat(self): def test_constructor_dtype(self): # passing a dtype with a tz should localize - idx = DatetimeIndex(['2013-01-01', - '2013-01-02'], + idx = DatetimeIndex(['2013-01-01', '2013-01-02'], dtype='datetime64[ns, US/Eastern]') expected = DatetimeIndex(['2013-01-01', '2013-01-02'] ).tz_localize('US/Eastern') - self.assertTrue(idx.equals(expected)) + tm.assert_index_equal(idx, expected) - idx = DatetimeIndex(['2013-01-01', - '2013-01-02'], + idx = DatetimeIndex(['2013-01-01', '2013-01-02'], tz='US/Eastern') - self.assertTrue(idx.equals(expected)) + tm.assert_index_equal(idx, expected) # if we already have a tz and its not the same, then raise idx = DatetimeIndex(['2013-01-01', '2013-01-02'], @@ -2620,7 +2744,7 @@ def test_constructor_dtype(self): idx, tz='CET', dtype='datetime64[ns, US/Eastern]')) result = DatetimeIndex(idx, dtype='datetime64[ns, US/Eastern]') - self.assertTrue(idx.equals(result)) + tm.assert_index_equal(idx, result) def test_constructor_name(self): idx = DatetimeIndex(start='2000-01-01', periods=1, freq='A', @@ -2736,7 +2860,7 @@ def test_map(self): f = lambda 
x: x.strftime('%Y%m%d') result = rng.map(f) - exp = [f(x) for x in rng] + exp = np.array([f(x) for x in rng], dtype=' val - expected = np.array([x > val for x in self.series]) - self.assert_numpy_array_equal(result, expected) + expected = Series([x > val for x in self.series]) + self.assert_series_equal(result, expected) val = self.series[5] result = self.series > val - expected = np.array([x > val for x in self.series]) - self.assert_numpy_array_equal(result, expected) + expected = Series([x > val for x in self.series]) + self.assert_series_equal(result, expected) def test_between(self): left, right = self.series[[2, 7]] @@ -4229,68 +4355,6 @@ def check(val, unit=None, h=1, s=1, us=0): result = Timestamp('NaT') self.assertIs(result, NaT) - def test_unit_errors(self): - # GH 11758 - # test proper behavior with erros - - with self.assertRaises(ValueError): - to_datetime([1], unit='D', format='%Y%m%d') - - values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan, - 'NaT', ''] - result = to_datetime(values, unit='D', errors='ignore') - expected = Index([11111111, Timestamp('1970-01-02'), - Timestamp('1970-01-02'), pd.NaT, - pd.NaT, pd.NaT, pd.NaT, pd.NaT], - dtype=object) - tm.assert_index_equal(result, expected) - - result = to_datetime(values, unit='D', errors='coerce') - expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', - 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) - tm.assert_index_equal(result, expected) - - with self.assertRaises(tslib.OutOfBoundsDatetime): - to_datetime(values, unit='D', errors='raise') - - values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT'] - - result = to_datetime(values, errors='ignore', unit='s') - expected = Index([1420043460000, pd.NaT, pd.NaT, - pd.NaT, pd.NaT], dtype=object) - tm.assert_index_equal(result, expected) - - result = to_datetime(values, errors='coerce', unit='s') - expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) - tm.assert_index_equal(result, expected) - - with self.assertRaises(tslib.OutOfBoundsDatetime): - to_datetime(values, errors='raise', unit='s') - - # if we have a string, then we raise a ValueError - # and NOT an OutOfBoundsDatetime - for val in ['foo', Timestamp('20130101')]: - try: - to_datetime(val, errors='raise', unit='s') - except tslib.OutOfBoundsDatetime: - raise AssertionError("incorrect exception raised") - except ValueError: - pass - - # consistency of conversions - expected = Timestamp('1970-05-09 14:25:11') - result = pd.to_datetime(11111111, unit='s', errors='raise') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - - result = pd.to_datetime(11111111, unit='s', errors='coerce') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - - result = pd.to_datetime(11111111, unit='s', errors='ignore') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - def test_roundtrip(self): # test value to string and back conversions @@ -4713,10 +4777,9 @@ def test_date_range_normalize(self): rng = date_range(snap, periods=n, normalize=False, freq='2D') offset = timedelta(2) - values = np.array([snap + i * offset for i in range(n)], - dtype='M8[ns]') + values = DatetimeIndex([snap + i * offset for i in range(n)]) - self.assert_numpy_array_equal(rng, values) + tm.assert_index_equal(rng, values) rng = date_range('1/1/2000 08:15', periods=n, normalize=False, freq='B') @@ -4735,7 +4798,7 @@ def test_timedelta(self): result = index - timedelta(1) expected = index + timedelta(-1) - self.assertTrue(result.equals(expected)) + 
tm.assert_index_equal(result, expected) # GH4134, buggy with timedeltas rng = date_range('2013', '2014') @@ -4744,8 +4807,8 @@ def test_timedelta(self): result2 = DatetimeIndex(s - np.timedelta64(100000000)) result3 = rng - np.timedelta64(100000000) result4 = DatetimeIndex(s - pd.offsets.Hour(1)) - self.assertTrue(result1.equals(result4)) - self.assertTrue(result2.equals(result3)) + tm.assert_index_equal(result1, result4) + tm.assert_index_equal(result2, result3) def test_shift(self): ts = Series(np.random.randn(5), @@ -4753,12 +4816,12 @@ def test_shift(self): result = ts.shift(1, freq='5T') exp_index = ts.index.shift(1, freq='5T') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) # GH #1063, multiple of same base result = ts.shift(1, freq='4H') exp_index = ts.index + datetools.Hour(4) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) self.assertRaises(ValueError, idx.shift, 1) @@ -4910,7 +4973,7 @@ def test_to_datetime_format(self): elif isinstance(expected, Timestamp): self.assertEqual(result, expected) else: - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) def test_to_datetime_format_YYYYMMDD(self): s = Series([19801222, 19801222] + [19810105] * 5) @@ -4941,9 +5004,10 @@ def test_to_datetime_format_YYYYMMDD(self): # GH 7930 s = Series([20121231, 20141231, 99991231]) result = pd.to_datetime(s, format='%Y%m%d', errors='ignore') - expected = np.array([datetime(2012, 12, 31), datetime( - 2014, 12, 31), datetime(9999, 12, 31)], dtype=object) - self.assert_numpy_array_equal(result, expected) + expected = Series([datetime(2012, 12, 31), + datetime(2014, 12, 31), datetime(9999, 12, 31)], + dtype=object) + self.assert_series_equal(result, expected) result = pd.to_datetime(s, format='%Y%m%d', errors='coerce') expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') @@ -5030,18 +5094,13 @@ def test_to_datetime_format_weeks(self): class TestToDatetimeInferFormat(tm.TestCase): def test_to_datetime_infer_datetime_format_consistent_format(self): - time_series = pd.Series(pd.date_range('20000101', periods=50, - freq='H')) + s = pd.Series(pd.date_range('20000101', periods=50, freq='H')) - test_formats = [ - '%m-%d-%Y', - '%m/%d/%Y %H:%M:%S.%f', - '%Y-%m-%dT%H:%M:%S.%f', - ] + test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f', + '%Y-%m-%dT%H:%M:%S.%f'] for test_format in test_formats: - s_as_dt_strings = time_series.apply( - lambda x: x.strftime(test_format)) + s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) with_format = pd.to_datetime(s_as_dt_strings, format=test_format) no_infer = pd.to_datetime(s_as_dt_strings, @@ -5051,70 +5110,45 @@ def test_to_datetime_infer_datetime_format_consistent_format(self): # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same - self.assert_numpy_array_equal(with_format, no_infer) - self.assert_numpy_array_equal(no_infer, yes_infer) + self.assert_series_equal(with_format, no_infer) + self.assert_series_equal(no_infer, yes_infer) def test_to_datetime_infer_datetime_format_inconsistent_format(self): - test_series = pd.Series(np.array([ - '01/01/2011 00:00:00', - '01-02-2011 00:00:00', - '2011-01-03T00:00:00', - ])) + s = pd.Series(np.array(['01/01/2011 00:00:00', + '01-02-2011 00:00:00', + '2011-01-03T00:00:00'])) # When the format is inconsistent, infer_datetime_format should just # fallback 
to the default parsing - self.assert_numpy_array_equal( - pd.to_datetime(test_series, infer_datetime_format=False), - pd.to_datetime(test_series, infer_datetime_format=True) - ) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) - test_series = pd.Series(np.array([ - 'Jan/01/2011', - 'Feb/01/2011', - 'Mar/01/2011', - ])) + s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011'])) - self.assert_numpy_array_equal( - pd.to_datetime(test_series, infer_datetime_format=False), - pd.to_datetime(test_series, infer_datetime_format=True) - ) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) def test_to_datetime_infer_datetime_format_series_with_nans(self): - test_series = pd.Series(np.array([ - '01/01/2011 00:00:00', - np.nan, - '01/03/2011 00:00:00', - np.nan, - ])) - - self.assert_numpy_array_equal( - pd.to_datetime(test_series, infer_datetime_format=False), - pd.to_datetime(test_series, infer_datetime_format=True) - ) + s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, + '01/03/2011 00:00:00', np.nan])) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): - test_series = pd.Series(np.array([ - np.nan, - np.nan, - '01/01/2011 00:00:00', - '01/02/2011 00:00:00', - '01/03/2011 00:00:00', - ])) + s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', + '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) - self.assert_numpy_array_equal( - pd.to_datetime(test_series, infer_datetime_format=False), - pd.to_datetime(test_series, infer_datetime_format=True) - ) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) def test_to_datetime_iso8601_noleading_0s(self): # GH 11871 - test_series = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) + s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) expected = pd.Series([pd.Timestamp('2014-01-01'), pd.Timestamp('2014-02-02'), pd.Timestamp('2015-03-03')]) - tm.assert_series_equal(pd.to_datetime(test_series), expected) - tm.assert_series_equal(pd.to_datetime(test_series, format='%Y-%m-%d'), - expected) + tm.assert_series_equal(pd.to_datetime(s), expected) + tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d'), expected) class TestGuessDatetimeFormat(tm.TestCase): diff --git a/pandas/tseries/tests/test_timeseries_legacy.py b/pandas/tseries/tests/test_timeseries_legacy.py index 086f23cd2d4fd..6f58ad3a57b48 100644 --- a/pandas/tseries/tests/test_timeseries_legacy.py +++ b/pandas/tseries/tests/test_timeseries_legacy.py @@ -85,7 +85,7 @@ def test_unpickle_legacy_len0_daterange(self): ex_index = DatetimeIndex([], freq='B') - self.assertTrue(result.index.equals(ex_index)) + self.assert_index_equal(result.index, ex_index) tm.assertIsInstance(result.index.freq, offsets.BDay) self.assertEqual(len(result), 0) @@ -116,7 +116,7 @@ def _check_join(left, right, how='inner'): return_indexers=True) tm.assertIsInstance(ra, DatetimeIndex) - self.assertTrue(ra.equals(ea)) + self.assert_index_equal(ra, ea) assert_almost_equal(rb, eb) assert_almost_equal(rc, ec) @@ -150,24 +150,24 @@ def test_setops(self): result = index[:5].union(obj_index[5:]) expected = index tm.assertIsInstance(result, DatetimeIndex) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = 
index[:10].intersection(obj_index[5:]) expected = index[5:10] tm.assertIsInstance(result, DatetimeIndex) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = index[:10] - obj_index[5:] expected = index[:5] tm.assertIsInstance(result, DatetimeIndex) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) def test_index_conversion(self): index = self.frame.index obj_index = index.asobject conv = DatetimeIndex(obj_index) - self.assertTrue(conv.equals(index)) + self.assert_index_equal(conv, index) self.assertRaises(ValueError, DatetimeIndex, ['a', 'b', 'c', 'd']) @@ -188,11 +188,11 @@ def test_setops_conversion_fail(self): result = index.union(right) expected = Index(np.concatenate([index.asobject, right])) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = index.intersection(right) expected = Index([]) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) def test_legacy_time_rules(self): rules = [('WEEKDAY', 'B'), ('EOM', 'BM'), ('W@MON', 'W-MON'), @@ -211,7 +211,7 @@ def test_legacy_time_rules(self): for old_freq, new_freq in rules: old_rng = date_range(start, end, freq=old_freq) new_rng = date_range(start, end, freq=new_freq) - self.assertTrue(old_rng.equals(new_rng)) + self.assert_index_equal(old_rng, new_rng) # test get_legacy_offset_name offset = datetools.get_offset(new_freq) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 1f0632377c851..afe9d0652db19 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -263,7 +263,7 @@ def test_create_with_fixed_tz(self): self.assertEqual(off, rng.tz) rng2 = date_range(start, periods=len(rng), tz=off) - self.assertTrue(rng.equals(rng2)) + self.assert_index_equal(rng, rng2) rng3 = date_range('3/11/2012 05:00:00+07:00', '6/11/2012 05:00:00+07:00') @@ -287,7 +287,7 @@ def test_date_range_localize(self): rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') rng3 = rng3.tz_localize('US/Eastern') - self.assertTrue(rng.equals(rng3)) + self.assert_index_equal(rng, rng3) # DST transition time val = rng[0] @@ -296,14 +296,14 @@ def test_date_range_localize(self): self.assertEqual(val.hour, 3) self.assertEqual(exp.hour, 3) self.assertEqual(val, exp) # same UTC value - self.assertTrue(rng[:2].equals(rng2)) + self.assert_index_equal(rng[:2], rng2) # Right before the DST transition rng = date_range('3/11/2012 00:00', periods=2, freq='H', tz='US/Eastern') rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], tz='US/Eastern') - self.assertTrue(rng.equals(rng2)) + self.assert_index_equal(rng, rng2) exp = Timestamp('3/11/2012 00:00', tz='US/Eastern') self.assertEqual(exp.hour, 0) self.assertEqual(rng[0], exp) @@ -402,7 +402,7 @@ def test_tz_localize(self): dr = bdate_range('1/1/2009', '1/1/2010') dr_utc = bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) localized = dr.tz_localize(pytz.utc) - self.assert_numpy_array_equal(dr_utc, localized) + self.assert_index_equal(dr_utc, localized) def test_with_tz_ambiguous_times(self): tz = self.tz('US/Eastern') @@ -440,22 +440,22 @@ def test_ambiguous_infer(self): '11/06/2011 02:00', '11/06/2011 03:00'] di = DatetimeIndex(times) localized = di.tz_localize(tz, ambiguous='infer') - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) with tm.assert_produces_warning(FutureWarning): localized_old = di.tz_localize(tz, infer_dst=True) - 
self.assert_numpy_array_equal(dr, localized_old) - self.assert_numpy_array_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous='infer')) + self.assert_index_equal(dr, localized_old) + self.assert_index_equal(dr, DatetimeIndex(times, tz=tz, + ambiguous='infer')) # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=datetools.Hour()) localized = dr.tz_localize(tz) localized_infer = dr.tz_localize(tz, ambiguous='infer') - self.assert_numpy_array_equal(localized, localized_infer) + self.assert_index_equal(localized, localized_infer) with tm.assert_produces_warning(FutureWarning): localized_infer_old = dr.tz_localize(tz, infer_dst=True) - self.assert_numpy_array_equal(localized, localized_infer_old) + self.assert_index_equal(localized, localized_infer_old) def test_ambiguous_flags(self): # November 6, 2011, fall back, repeat 2 AM hour @@ -471,20 +471,20 @@ def test_ambiguous_flags(self): di = DatetimeIndex(times) is_dst = [1, 1, 0, 0, 0] localized = di.tz_localize(tz, ambiguous=is_dst) - self.assert_numpy_array_equal(dr, localized) - self.assert_numpy_array_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous=is_dst)) + self.assert_index_equal(dr, localized) + self.assert_index_equal(dr, DatetimeIndex(times, tz=tz, + ambiguous=is_dst)) localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) localized = di.tz_localize(tz, ambiguous=np.array(is_dst).astype('bool')) - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) # Test constructor localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst) - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) # Test duplicate times where infer_dst fails times += times @@ -497,7 +497,7 @@ def test_ambiguous_flags(self): is_dst = np.hstack((is_dst, is_dst)) localized = di.tz_localize(tz, ambiguous=is_dst) dr = dr.append(dr) - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, @@ -505,7 +505,7 @@ def test_ambiguous_flags(self): is_dst = np.array([1] * 10) localized = dr.tz_localize(tz) localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) - self.assert_numpy_array_equal(localized, localized_is_dst) + self.assert_index_equal(localized, localized_is_dst) # construction with an ambiguous end-point # GH 11626 @@ -531,7 +531,10 @@ def test_ambiguous_nat(self): times = ['11/06/2011 00:00', np.NaN, np.NaN, '11/06/2011 02:00', '11/06/2011 03:00'] di_test = DatetimeIndex(times, tz='US/Eastern') - self.assert_numpy_array_equal(di_test, localized) + + # left dtype is datetime64[ns, US/Eastern] + # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] + self.assert_numpy_array_equal(di_test.values, localized.values) def test_nonexistent_raise_coerce(self): # See issue 13057 @@ -580,7 +583,7 @@ def test_tz_string(self): tz=self.tzstr('US/Eastern')) expected = date_range('1/1/2000', periods=10, tz=self.tz('US/Eastern')) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) def test_take_dont_lose_meta(self): tm._skip_if_no_pytz() @@ -673,7 +676,7 @@ def test_convert_tz_aware_datetime_datetime(self): self.assertTrue(self.cmptz(result.tz, self.tz('US/Eastern'))) converted = to_datetime(dates_aware, utc=True) - ex_vals = [Timestamp(x).value for x in dates_aware] + ex_vals = 
np.array([Timestamp(x).value for x in dates_aware]) self.assert_numpy_array_equal(converted.asi8, ex_vals) self.assertIs(converted.tz, pytz.utc) @@ -779,10 +782,11 @@ def test_date_range_span_dst_transition(self): self.assertTrue((dr.hour == 0).all()) def test_convert_datetime_list(self): - dr = date_range('2012-06-02', periods=10, tz=self.tzstr('US/Eastern')) + dr = date_range('2012-06-02', periods=10, + tz=self.tzstr('US/Eastern'), name='foo') dr2 = DatetimeIndex(list(dr), name='foo') - self.assertTrue(dr.equals(dr2)) + self.assert_index_equal(dr, dr2) self.assertEqual(dr.tz, dr2.tz) self.assertEqual(dr2.name, 'foo') @@ -845,7 +849,7 @@ def test_datetimeindex_tz(self): idx4 = DatetimeIndex(np.array(arr), tz=self.tzstr('US/Eastern')) for other in [idx2, idx3, idx4]: - self.assertTrue(idx1.equals(other)) + self.assert_index_equal(idx1, other) def test_datetimeindex_tz_nat(self): idx = to_datetime([Timestamp("2013-1-1", tz=self.tzstr('US/Eastern')), @@ -898,6 +902,88 @@ def test_utc_with_system_utc(self): # check that the time hasn't changed. self.assertEqual(ts, ts.tz_convert(dateutil.tz.tzutc())) + def test_tz_convert_hour_overflow_dst(self): + # Regression test for: + # https://github.com/pydata/pandas/issues/13306 + + # sorted case US/Eastern -> UTC + ts = ['2008-05-12 09:50:00', + '2008-12-12 09:50:35', + '2009-05-12 09:50:32'] + tt = to_datetime(ts).tz_localize('US/Eastern') + ut = tt.tz_convert('UTC') + expected = np.array([13, 14, 13], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = ['2008-05-12 13:50:00', + '2008-12-12 14:50:35', + '2009-05-12 13:50:32'] + tt = to_datetime(ts).tz_localize('UTC') + ut = tt.tz_convert('US/Eastern') + expected = np.array([9, 9, 9], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = ['2008-05-12 09:50:00', + '2008-12-12 09:50:35', + '2008-05-12 09:50:32'] + tt = to_datetime(ts).tz_localize('US/Eastern') + ut = tt.tz_convert('UTC') + expected = np.array([13, 14, 13], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = ['2008-05-12 13:50:00', + '2008-12-12 14:50:35', + '2008-05-12 13:50:32'] + tt = to_datetime(ts).tz_localize('UTC') + ut = tt.tz_convert('US/Eastern') + expected = np.array([9, 9, 9], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + def test_tz_convert_hour_overflow_dst_timestamps(self): + # Regression test for: + # https://github.com/pydata/pandas/issues/13306 + + tz = self.tzstr('US/Eastern') + + # sorted case US/Eastern -> UTC + ts = [Timestamp('2008-05-12 09:50:00', tz=tz), + Timestamp('2008-12-12 09:50:35', tz=tz), + Timestamp('2009-05-12 09:50:32', tz=tz)] + tt = to_datetime(ts) + ut = tt.tz_convert('UTC') + expected = np.array([13, 14, 13], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), + Timestamp('2008-12-12 14:50:35', tz='UTC'), + Timestamp('2009-05-12 13:50:32', tz='UTC')] + tt = to_datetime(ts) + ut = tt.tz_convert('US/Eastern') + expected = np.array([9, 9, 9], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = [Timestamp('2008-05-12 09:50:00', tz=tz), + Timestamp('2008-12-12 09:50:35', tz=tz), + Timestamp('2008-05-12 09:50:32', tz=tz)] + tt = to_datetime(ts) + ut = tt.tz_convert('UTC') + expected = np.array([13, 14, 13], dtype=np.int32) + 
self.assert_numpy_array_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), + Timestamp('2008-12-12 14:50:35', tz='UTC'), + Timestamp('2008-05-12 13:50:32', tz='UTC')] + tt = to_datetime(ts) + ut = tt.tz_convert('US/Eastern') + expected = np.array([9, 9, 9], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + def test_tslib_tz_convert_trans_pos_plus_1__bug(self): # Regression test for tslib.tz_convert(vals, tz1, tz2). # See https://github.com/pydata/pandas/issues/4496 for details. @@ -1011,7 +1097,7 @@ def test_tz_localize_naive(self): conv = rng.tz_localize('US/Pacific') exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') - self.assertTrue(conv.equals(exp)) + self.assert_index_equal(conv, exp) def test_tz_localize_roundtrip(self): for tz in self.timezones: @@ -1143,7 +1229,7 @@ def test_join_aware(self): result = test1.join(test2, how='outer') ex_index = test1.index.union(test2.index) - self.assertTrue(result.index.equals(ex_index)) + self.assert_index_equal(result.index, ex_index) self.assertTrue(result.index.tz.zone == 'US/Central') # non-overlapping @@ -1199,11 +1285,11 @@ def test_append_aware_naive(self): ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) + self.assertTrue(ts_result.index.equals(ts1.index.asobject.append( ts2.index.asobject))) # mixed - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = lrange(100) ts1 = Series(np.random.randn(len(rng1)), index=rng1) @@ -1280,7 +1366,7 @@ def test_datetimeindex_tz(self): rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI', tz='US/Eastern') rng2 = DatetimeIndex(data=rng, tz='US/Eastern') - self.assertTrue(rng.equals(rng2)) + self.assert_index_equal(rng, rng2) def test_normalize_tz(self): rng = date_range('1/1/2000 9:30', periods=10, freq='D', @@ -1289,7 +1375,7 @@ def test_normalize_tz(self): result = rng.normalize() expected = date_range('1/1/2000', periods=10, freq='D', tz='US/Eastern') - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) @@ -1298,7 +1384,7 @@ def test_normalize_tz(self): result = rng.normalize() expected = date_range('1/1/2000', periods=10, freq='D', tz='UTC') - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) @@ -1307,7 +1393,7 @@ def test_normalize_tz(self): rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz=tzlocal()) result = rng.normalize() expected = date_range('1/1/2000', periods=10, freq='D', tz=tzlocal()) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) @@ -1324,45 +1410,45 @@ def test_tzaware_offset(self): '2010-11-01 07:00'], freq='H', tz=tz) offset = dates + offsets.Hour(5) - self.assertTrue(offset.equals(expected)) + self.assert_index_equal(offset, expected) offset = dates + np.timedelta64(5, 'h') - self.assertTrue(offset.equals(expected)) + self.assert_index_equal(offset, expected) offset = dates + timedelta(hours=5) - self.assertTrue(offset.equals(expected)) + self.assert_index_equal(offset, expected) def test_nat(self): # GH 5546 dates = [NaT] idx = DatetimeIndex(dates) idx = idx.tz_localize('US/Pacific') - self.assertTrue(idx.equals(DatetimeIndex(dates, 
tz='US/Pacific'))) + self.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) idx = idx.tz_convert('US/Eastern') - self.assertTrue(idx.equals(DatetimeIndex(dates, tz='US/Eastern'))) + self.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Eastern')) idx = idx.tz_convert('UTC') - self.assertTrue(idx.equals(DatetimeIndex(dates, tz='UTC'))) + self.assert_index_equal(idx, DatetimeIndex(dates, tz='UTC')) dates = ['2010-12-01 00:00', '2010-12-02 00:00', NaT] idx = DatetimeIndex(dates) idx = idx.tz_localize('US/Pacific') - self.assertTrue(idx.equals(DatetimeIndex(dates, tz='US/Pacific'))) + self.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) idx = idx.tz_convert('US/Eastern') expected = ['2010-12-01 03:00', '2010-12-02 03:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Eastern'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) idx = idx + offsets.Hour(5) expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Eastern'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) idx = idx.tz_convert('US/Pacific') expected = ['2010-12-01 05:00', '2010-12-02 05:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Pacific'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) idx = idx + np.timedelta64(3, 'h') expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Pacific'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) idx = idx.tz_convert('US/Eastern') expected = ['2010-12-01 11:00', '2010-12-02 11:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Eastern'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) if __name__ == '__main__': diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 4543047a8a72a..c6436163b9edb 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -2,7 +2,7 @@ from distutils.version import LooseVersion import numpy as np -from pandas import tslib +from pandas import tslib, lib import pandas._period as period import datetime @@ -25,6 +25,35 @@ from pandas.util.testing import assert_series_equal, _skip_if_has_locale +class TestTsUtil(tm.TestCase): + + def test_try_parse_dates(self): + from dateutil.parser import parse + arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) + + result = lib.try_parse_dates(arr, dayfirst=True) + expected = [parse(d, dayfirst=True) for d in arr] + self.assertTrue(np.array_equal(result, expected)) + + def test_min_valid(self): + # Ensure that Timestamp.min is a valid Timestamp + Timestamp(Timestamp.min) + + def test_max_valid(self): + # Ensure that Timestamp.max is a valid Timestamp + Timestamp(Timestamp.max) + + def test_to_datetime_bijective(self): + # Ensure that converting to datetime and back only loses precision + # by going from nanoseconds to microseconds. 
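+        # For example (illustrative comment, not part of the original
+        # patch): Timestamp.max.value is the largest int64 nanosecond
+        # count, 9223372036854775807, and to_pydatetime() truncates to
+        # microsecond resolution, so the division by 1000 below compares
+        # everything except the sub-microsecond digits.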
+        self.assertEqual(
+            Timestamp(Timestamp.max.to_pydatetime()).value / 1000,
+            Timestamp.max.value / 1000)
+        self.assertEqual(
+            Timestamp(Timestamp.min.to_pydatetime()).value / 1000,
+            Timestamp.min.value / 1000)
+
+

 class TestTimestamp(tm.TestCase):

     def test_constructor(self):
@@ -180,6 +209,52 @@ def test_constructor_invalid(self):
         with tm.assertRaisesRegexp(ValueError, 'Cannot convert Period'):
             Timestamp(Period('1000-01-01'))

+    def test_constructor_positional(self):
+        # GH 10758
+        with tm.assertRaises(TypeError):
+            Timestamp(2000, 1)
+        with tm.assertRaises(ValueError):
+            Timestamp(2000, 0, 1)
+        with tm.assertRaises(ValueError):
+            Timestamp(2000, 13, 1)
+        with tm.assertRaises(ValueError):
+            Timestamp(2000, 1, 0)
+        with tm.assertRaises(ValueError):
+            Timestamp(2000, 1, 32)
+
+        # GH 11630
+        self.assertEqual(
+            repr(Timestamp(2015, 11, 12)),
+            repr(Timestamp('20151112')))
+
+        self.assertEqual(
+            repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)),
+            repr(Timestamp('2015-11-12 01:02:03.999999')))
+
+        self.assertIs(Timestamp(None), pd.NaT)
+
+    def test_constructor_keyword(self):
+        # GH 10758
+        with tm.assertRaises(TypeError):
+            Timestamp(year=2000, month=1)
+        with tm.assertRaises(ValueError):
+            Timestamp(year=2000, month=0, day=1)
+        with tm.assertRaises(ValueError):
+            Timestamp(year=2000, month=13, day=1)
+        with tm.assertRaises(ValueError):
+            Timestamp(year=2000, month=1, day=0)
+        with tm.assertRaises(ValueError):
+            Timestamp(year=2000, month=1, day=32)
+
+        self.assertEqual(
+            repr(Timestamp(year=2015, month=11, day=12)),
+            repr(Timestamp('20151112')))
+
+        self.assertEqual(
+            repr(Timestamp(year=2015, month=11, day=12,
+                           hour=1, minute=2, second=3, microsecond=999999)),
+            repr(Timestamp('2015-11-12 01:02:03.999999')))
+
     def test_conversion(self):
         # GH 9255
         ts = Timestamp('2000-01-01')
@@ -766,8 +841,9 @@ def test_parsers_time(self):
             self.assert_series_equal(tools.to_time(Series(arg, name="test")),
                                      Series(expected_arr, name="test"))

-            self.assert_numpy_array_equal(tools.to_time(np.array(arg)),
-                                          np.array(expected_arr, dtype=np.object_))
+            res = tools.to_time(np.array(arg))
+            self.assertIsInstance(res, list)
+            self.assert_equal(res, expected_arr)

     def test_parsers_monthfreq(self):
         cases = {'201101': datetime.datetime(2011, 1, 1, 0, 0),
@@ -1290,6 +1366,24 @@ def test_shift_months(self):
                                             years=years, months=months)
                               for x in s])
         tm.assert_index_equal(actual, expected)

+    def test_round(self):
+        stamp = Timestamp('2000-01-05 05:09:15.13')
+
+        def _check_round(freq, expected):
+            result = stamp.round(freq=freq)
+            self.assertEqual(result, expected)
+
+        for freq, expected in [
+            ('D', Timestamp('2000-01-05 00:00:00')),
+            ('H', Timestamp('2000-01-05 05:00:00')),
+            ('S', Timestamp('2000-01-05 05:09:15'))
+        ]:
+            _check_round(freq, expected)
+
+        msg = "Could not evaluate"
+        tm.assertRaisesRegexp(ValueError, msg,
+                              stamp.round, 'foo')
+

 class TestTimestampOps(tm.TestCase):
     def test_timestamp_and_datetime(self):
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
index a46149035dbae..d5e87d1df2462 100644
--- a/pandas/tseries/tools.py
+++ b/pandas/tseries/tools.py
@@ -221,7 +221,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
         - If True, require an exact format match.
         - If False, allow the format to match anywhere in the target string.

-    unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
+    unit : string, default 'ns'
+        unit of the arg (D,s,ms,us,ns), denoting the unit in epoch
     (e.g. a unix timestamp), which is an integer/float number.
infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index a240558025090..6453e65ecdc81 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -214,8 +214,8 @@ cdef inline bint _is_fixed_offset(object tz): return 0 return 1 - _zero_time = datetime_time(0, 0) +_no_input = object() # Python front end to C extension type _Timestamp # This serves as the box for datetime64 @@ -225,6 +225,10 @@ class Timestamp(_Timestamp): for the entries that make up a DatetimeIndex, and other timeseries oriented data structures in pandas. + There are essentially three calling conventions for the constructor. The + primary form accepts four parameters. They can be passed by position or + keyword. + Parameters ---------- ts_input : datetime-like, str, int, float @@ -235,6 +239,23 @@ class Timestamp(_Timestamp): Time zone for time which Timestamp will have. unit : string numpy unit used for conversion, if ts_input is int or float + + The other two forms mimic the parameters from ``datetime.datetime``. They + can be passed by either position or keyword, but not both mixed together. + + :func:`datetime.datetime` Parameters + ------------------------------------ + + .. versionadded:: 0.18.2 + + year : int + month : int + day : int + hour : int, optional, default is 0 + minute : int, optional, default is 0 + second : int, optional, default is 0 + microsecond : int, optional, default is 0 + tzinfo : datetime.tzinfo, optional, default is None """ @classmethod @@ -288,10 +309,46 @@ class Timestamp(_Timestamp): def combine(cls, date, time): return cls(datetime.combine(date, time)) - def __new__(cls, object ts_input, object offset=None, tz=None, unit=None): + def __new__(cls, + object ts_input=_no_input, object offset=None, tz=None, unit=None, + year=None, month=None, day=None, + hour=None, minute=None, second=None, microsecond=None, + tzinfo=None): + # The parameter list folds together legacy parameter names (the first + # four) and positional and keyword parameter names from pydatetime. + # + # There are three calling forms: + # + # - In the legacy form, the first parameter, ts_input, is required + # and may be datetime-like, str, int, or float. The second + # parameter, offset, is optional and may be str or DateOffset. + # + # - ints in the first, second, and third arguments indicate + # pydatetime positional arguments. Only the first 8 arguments + # (standing in for year, month, day, hour, minute, second, + # microsecond, tzinfo) may be non-None. As a shortcut, we just + # check that the second argument is an int. + # + # - Nones for the first four (legacy) arguments indicate pydatetime + # keyword arguments. year, month, and day are required. As a + # shortcut, we just check that the first argument was not passed. + # + # Mixing pydatetime positional and keyword arguments is forbidden! + cdef _TSObject ts cdef _Timestamp ts_base + if ts_input is _no_input: + # User passed keyword arguments. 
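+            # For example (illustrative comment, not part of the original
+            # patch):
+            #   Timestamp(year=2015, month=11, day=12, hour=1)
+            # builds datetime(2015, 11, 12, 1) and re-enters this
+            # constructor through the legacy single-argument path.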
@@ -288,10 +309,46 @@ class Timestamp(_Timestamp):
     def combine(cls, date, time):
         return cls(datetime.combine(date, time))
 
-    def __new__(cls, object ts_input, object offset=None, tz=None, unit=None):
+    def __new__(cls,
+                object ts_input=_no_input, object offset=None, tz=None, unit=None,
+                year=None, month=None, day=None,
+                hour=None, minute=None, second=None, microsecond=None,
+                tzinfo=None):
+        # The parameter list folds together legacy parameter names (the first
+        # four) and positional and keyword parameter names from pydatetime.
+        #
+        # There are three calling forms:
+        #
+        # - In the legacy form, the first parameter, ts_input, is required
+        #   and may be datetime-like, str, int, or float. The second
+        #   parameter, offset, is optional and may be str or DateOffset.
+        #
+        # - ints in the first, second, and third arguments indicate
+        #   pydatetime positional arguments. Only the first 8 arguments
+        #   (standing in for year, month, day, hour, minute, second,
+        #   microsecond, tzinfo) may be non-None. As a shortcut, we just
+        #   check that the second argument is an int.
+        #
+        # - Nones for the first four (legacy) arguments indicate pydatetime
+        #   keyword arguments. year, month, and day are required. As a
+        #   shortcut, we just check that the first argument was not passed.
+        #
+        # Mixing pydatetime positional and keyword arguments is forbidden!
+
         cdef _TSObject ts
         cdef _Timestamp ts_base
 
+        if ts_input is _no_input:
+            # User passed keyword arguments.
+            return Timestamp(datetime(year, month, day, hour or 0,
+                             minute or 0, second or 0, microsecond or 0,
+                             tzinfo), tz=tzinfo)
+        elif is_integer_object(offset):
+            # User passed positional arguments:
+            # Timestamp(year, month, day[, hour[, minute[, second[, microsecond[, tzinfo]]]]])
+            return Timestamp(datetime(ts_input, offset, tz, unit or 0,
+                             year or 0, month or 0, day or 0, hour), tz=hour)
+
         ts = convert_to_tsobject(ts_input, tz, unit, 0, 0)
 
         if ts.value == NPY_NAT:
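The positional branch above is easy to misread: because the legacy signature comes first, positional pydatetime values arrive in the legacy slots (ts_input carries the year, offset the month, tz the day, unit the hour, and the hour parameter ends up carrying tzinfo). A plain-Python sketch of that re-mapping (the helper name is hypothetical, for illustration only):

    from datetime import datetime

    def _remap_positional(ts_input, offset, tz=None, unit=None,
                          year=None, month=None, day=None, hour=None):
        # mirrors the branch above: ts_input->year, offset->month, tz->day,
        # unit->hour, year->minute, month->second, day->microsecond,
        # hour->tzinfo
        return datetime(ts_input, offset, tz, unit or 0,
                        year or 0, month or 0, day or 0, hour)

    _remap_positional(2015, 11, 12)        # datetime(2015, 11, 12, 0, 0)
    _remap_positional(2015, 11, 12, 1, 2)  # datetime(2015, 11, 12, 1, 2)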
array") + if is_coerce: + # coerce all of the integers/floats to NaT, preserve + # the datetimes and other convertibles + for i in range(n): + val = values[i] + if is_integer_object(val) or is_float_object(val): + result[i] = NPY_NAT + elif is_raise: + raise ValueError("mixed datetimes and integers in passed array") + else: + raise TypeError return result except OutOfBoundsDatetime: @@ -3667,8 +3754,8 @@ except: def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): cdef: - ndarray[int64_t] utc_dates, tt, result, trans, deltas - Py_ssize_t i, pos, n = len(vals) + ndarray[int64_t] utc_dates, tt, result, trans, deltas, posn + Py_ssize_t i, j, pos, n = len(vals) int64_t v, offset pandas_datetimestruct dts Py_ssize_t trans_len @@ -3704,19 +3791,18 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): return vals trans_len = len(trans) - pos = trans.searchsorted(tt[0]) - 1 - if pos < 0: - raise ValueError('First time before start of DST info') - - offset = deltas[pos] + posn = trans.searchsorted(tt, side='right') + j = 0 for i in range(n): v = vals[i] if v == NPY_NAT: utc_dates[i] = NPY_NAT else: - while pos + 1 < trans_len and v >= trans[pos + 1]: - pos += 1 - offset = deltas[pos] + pos = posn[j] - 1 + j = j + 1 + if pos < 0: + raise ValueError('First time before start of DST info') + offset = deltas[pos] utc_dates[i] = v - offset else: utc_dates = vals @@ -3751,20 +3837,18 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if (result==NPY_NAT).all(): return result - pos = trans.searchsorted(utc_dates[utc_dates!=NPY_NAT][0]) - 1 - if pos < 0: - raise ValueError('First time before start of DST info') - - # TODO: this assumed sortedness :/ - offset = deltas[pos] + posn = trans.searchsorted(utc_dates[utc_dates!=NPY_NAT], side='right') + j = 0 for i in range(n): v = utc_dates[i] if vals[i] == NPY_NAT: result[i] = vals[i] else: - while pos + 1 < trans_len and v >= trans[pos + 1]: - pos += 1 - offset = deltas[pos] + pos = posn[j] - 1 + j = j + 1 + if pos < 0: + raise ValueError('First time before start of DST info') + offset = deltas[pos] result[i] = v + offset return result diff --git a/pandas/types/api.py b/pandas/types/api.py index bb61025a41a37..721d8d29bba8b 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -28,7 +28,11 @@ def pandas_dtype(dtype): ------- np.dtype or a pandas dtype """ - if isinstance(dtype, string_types): + if isinstance(dtype, DatetimeTZDtype): + return dtype + elif isinstance(dtype, CategoricalDtype): + return dtype + elif isinstance(dtype, string_types): try: return DatetimeTZDtype.construct_from_string(dtype) except TypeError: @@ -40,3 +44,32 @@ def pandas_dtype(dtype): pass return np.dtype(dtype) + +def na_value_for_dtype(dtype): + """ + Return a dtype compat na value + + Parameters + ---------- + dtype : string / dtype + + Returns + ------- + dtype compat na value + """ + + from pandas.core import common as com + from pandas import NaT + dtype = pandas_dtype(dtype) + + if (com.is_datetime64_dtype(dtype) or + com.is_datetime64tz_dtype(dtype) or + com.is_timedelta64_dtype(dtype)): + return NaT + elif com.is_float_dtype(dtype): + return np.nan + elif com.is_integer_dtype(dtype): + return 0 + elif com.is_bool_dtype(dtype): + return False + return np.nan diff --git a/pandas/types/concat.py b/pandas/types/concat.py index eb18023d6409d..5cd7abb6889b7 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -249,7 +249,7 @@ def convert_to_pydatetime(x, axis): # thus no need to care # we require ALL of the same tz for 
@@ -3667,8 +3754,8 @@ except:
 
 def tz_convert(ndarray[int64_t] vals, object tz1, object tz2):
     cdef:
-        ndarray[int64_t] utc_dates, tt, result, trans, deltas
-        Py_ssize_t i, pos, n = len(vals)
+        ndarray[int64_t] utc_dates, tt, result, trans, deltas, posn
+        Py_ssize_t i, j, pos, n = len(vals)
         int64_t v, offset
         pandas_datetimestruct dts
         Py_ssize_t trans_len
@@ -3704,19 +3791,18 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2):
             return vals
 
         trans_len = len(trans)
-        pos = trans.searchsorted(tt[0]) - 1
-        if pos < 0:
-            raise ValueError('First time before start of DST info')
-
-        offset = deltas[pos]
+        posn = trans.searchsorted(tt, side='right')
+        j = 0
         for i in range(n):
             v = vals[i]
             if v == NPY_NAT:
                 utc_dates[i] = NPY_NAT
             else:
-                while pos + 1 < trans_len and v >= trans[pos + 1]:
-                    pos += 1
-                    offset = deltas[pos]
+                pos = posn[j] - 1
+                j = j + 1
+                if pos < 0:
+                    raise ValueError('First time before start of DST info')
+                offset = deltas[pos]
                 utc_dates[i] = v - offset
     else:
         utc_dates = vals
@@ -3751,20 +3837,18 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2):
 
         if (result==NPY_NAT).all():
             return result
 
-        pos = trans.searchsorted(utc_dates[utc_dates!=NPY_NAT][0]) - 1
-        if pos < 0:
-            raise ValueError('First time before start of DST info')
-
-        # TODO: this assumed sortedness :/
-        offset = deltas[pos]
+        posn = trans.searchsorted(utc_dates[utc_dates!=NPY_NAT], side='right')
+        j = 0
         for i in range(n):
             v = utc_dates[i]
             if vals[i] == NPY_NAT:
                 result[i] = vals[i]
             else:
-                while pos + 1 < trans_len and v >= trans[pos + 1]:
-                    pos += 1
-                    offset = deltas[pos]
+                pos = posn[j] - 1
+                j = j + 1
+                if pos < 0:
+                    raise ValueError('First time before start of DST info')
+                offset = deltas[pos]
                 result[i] = v + offset
 
     return result
diff --git a/pandas/types/api.py b/pandas/types/api.py
index bb61025a41a37..721d8d29bba8b 100644
--- a/pandas/types/api.py
+++ b/pandas/types/api.py
@@ -28,7 +28,11 @@ def pandas_dtype(dtype):
     -------
     np.dtype or a pandas dtype
     """
-    if isinstance(dtype, string_types):
+    if isinstance(dtype, DatetimeTZDtype):
+        return dtype
+    elif isinstance(dtype, CategoricalDtype):
+        return dtype
+    elif isinstance(dtype, string_types):
         try:
             return DatetimeTZDtype.construct_from_string(dtype)
         except TypeError:
@@ -40,3 +44,32 @@ def pandas_dtype(dtype):
             pass
 
     return np.dtype(dtype)
+
+
+def na_value_for_dtype(dtype):
+    """
+    Return a dtype-compatible NA value
+
+    Parameters
+    ----------
+    dtype : string / dtype
+
+    Returns
+    -------
+    dtype-compatible NA value
+    """
+
+    from pandas.core import common as com
+    from pandas import NaT
+    dtype = pandas_dtype(dtype)
+
+    if (com.is_datetime64_dtype(dtype) or
+            com.is_datetime64tz_dtype(dtype) or
+            com.is_timedelta64_dtype(dtype)):
+        return NaT
+    elif com.is_float_dtype(dtype):
+        return np.nan
+    elif com.is_integer_dtype(dtype):
+        return 0
+    elif com.is_bool_dtype(dtype):
+        return False
+    return np.nan
diff --git a/pandas/types/concat.py b/pandas/types/concat.py
index eb18023d6409d..5cd7abb6889b7 100644
--- a/pandas/types/concat.py
+++ b/pandas/types/concat.py
@@ -249,7 +249,7 @@ def convert_to_pydatetime(x, axis):
         # thus no need to care
 
         # we require ALL of the same tz for datetimetz
-        tzs = set([x.tz for x in to_concat])
+        tzs = set([str(x.tz) for x in to_concat])
         if len(tzs) == 1:
             from pandas.tseries.index import DatetimeIndex
             new_values = np.concatenate([x.tz_localize(None).asi8
diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py
index e6adbc8500117..140d494c3e1b2 100644
--- a/pandas/types/dtypes.py
+++ b/pandas/types/dtypes.py
@@ -108,6 +108,16 @@ class CategoricalDtype(ExtensionDtype):
     kind = 'O'
     str = '|O08'
     base = np.dtype('O')
+    _cache = {}
+
+    def __new__(cls):
+
+        try:
+            return cls._cache[cls.name]
+        except KeyError:
+            c = object.__new__(cls)
+            cls._cache[cls.name] = c
+            return c
 
     def __hash__(self):
         # make myself hashable
@@ -155,9 +165,11 @@ class DatetimeTZDtype(ExtensionDtype):
     base = np.dtype('M8[ns]')
     _metadata = ['unit', 'tz']
     _match = re.compile("(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
+    _cache = {}
+
+    def __new__(cls, unit=None, tz=None):
+        """ Create a new unit if needed, otherwise return from the cache
 
-    def __init__(self, unit, tz=None):
-        """
         Parameters
         ----------
         unit : string unit that this represents, currently must be 'ns'
@@ -165,28 +177,46 @@ def __init__(self, unit, tz=None):
         """
         if isinstance(unit, DatetimeTZDtype):
-            self.unit, self.tz = unit.unit, unit.tz
-            return
+            unit, tz = unit.unit, unit.tz
 
-        if tz is None:
+        elif unit is None:
+            # we are called as an empty constructor
+            # generally for pickle compat
+            return object.__new__(cls)
+
+        elif tz is None:
 
             # we were passed a string that we can construct
             try:
-                m = self._match.search(unit)
+                m = cls._match.search(unit)
                 if m is not None:
-                    self.unit = m.groupdict()['unit']
-                    self.tz = m.groupdict()['tz']
-                    return
+                    unit = m.groupdict()['unit']
+                    tz = m.groupdict()['tz']
             except:
                 raise ValueError("could not construct DatetimeTZDtype")
 
+        elif isinstance(unit, compat.string_types):
+
+            if unit != 'ns':
+                raise ValueError("DatetimeTZDtype only supports ns units")
+
+            unit = unit
+            tz = tz
+
+        if tz is None:
             raise ValueError("DatetimeTZDtype constructor must have a tz "
                              "supplied")
 
-        if unit != 'ns':
-            raise ValueError("DatetimeTZDtype only supports ns units")
-        self.unit = unit
-        self.tz = tz
+        # set/retrieve from cache
+        key = (unit, str(tz))
+        try:
+            return cls._cache[key]
+        except KeyError:
+            u = object.__new__(cls)
+            u.unit = unit
+            u.tz = tz
+            cls._cache[key] = u
+            return u
 
     @classmethod
     def construct_from_string(cls, string):
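With the __new__-based caches above, constructing a dtype twice with the same parameters hands back the same object, and pandas_dtype now passes extension dtypes through untouched. A sketch using the module paths as they appear in this diff:

    from pandas.types.api import pandas_dtype, na_value_for_dtype
    from pandas.types.dtypes import DatetimeTZDtype

    dt = DatetimeTZDtype('ns', 'US/Eastern')
    DatetimeTZDtype('ns', 'US/Eastern') is dt   # True: served from _cache
    pandas_dtype(dt) is dt                      # True: returned as-is

    na_value_for_dtype('datetime64[ns]')        # NaT
    na_value_for_dtype('float64')               # nan
    na_value_for_dtype('int64')                 # 0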
diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py
index 115423f3e3e22..e74568f39418c 100644
--- a/pandas/util/print_versions.py
+++ b/pandas/util/print_versions.py
@@ -4,6 +4,7 @@
 import struct
 import subprocess
 import codecs
+import importlib
 
 
 def get_sys_info():
@@ -55,7 +56,6 @@ def get_sys_info():
 
 def show_versions(as_json=False):
-    import imp
     sys_info = get_sys_info()
 
     deps = [
@@ -99,11 +99,7 @@ def show_versions(as_json=False):
     deps_blob = list()
     for (modname, ver_f) in deps:
         try:
-            try:
-                mod = imp.load_module(modname, *imp.find_module(modname))
-            except (ImportError):
-                import importlib
-                mod = importlib.import_module(modname)
+            mod = importlib.import_module(modname)
             ver = ver_f(mod)
             deps_blob.append((modname, ver))
         except:
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 3ea4a09c453ee..03ccfcab24f58 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -19,18 +19,19 @@
 from distutils.version import LooseVersion
 
 from numpy.random import randn, rand
+from numpy.testing.decorators import slow  # noqa
 import numpy as np
 
 import pandas as pd
 from pandas.core.common import (is_sequence,
                                 array_equivalent, is_list_like,
                                 is_datetimelike_v_numeric,
-                                is_datetimelike_v_object, is_number,
-                                needs_i8_conversion)
+                                is_datetimelike_v_object,
+                                is_number, is_bool,
+                                needs_i8_conversion, is_categorical_dtype)
 from pandas.formats.printing import pprint_thing
 from pandas.core.algorithms import take_1d
 
 import pandas.compat as compat
-import pandas.lib as lib
 from pandas.compat import(
     filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter,
     raise_with_traceback, httplib, is_platform_windows, is_platform_32bit,
@@ -115,23 +116,71 @@ def assertNotAlmostEquals(self, *args, **kwargs):
                 self.assertNotAlmostEqual)(*args, **kwargs)
 
 
-def assert_almost_equal(left, right, check_exact=False, **kwargs):
+def assert_almost_equal(left, right, check_exact=False,
+                        check_dtype='equiv', check_less_precise=False,
+                        **kwargs):
+    """Check that the left and right objects are approximately equal.
+
+    Parameters
+    ----------
+    left : object
+    right : object
+    check_exact : bool, default False
+        Whether to compare numbers exactly.
+    check_dtype : bool, default 'equiv'
+        check dtype if both a and b are the same type
+    check_less_precise : bool or int, default False
+        Specify comparison precision. Only used when check_exact is False.
+        5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare
+    """
     if isinstance(left, pd.Index):
         return assert_index_equal(left, right, check_exact=check_exact,
+                                  exact=check_dtype,
+                                  check_less_precise=check_less_precise,
                                   **kwargs)
 
     elif isinstance(left, pd.Series):
         return assert_series_equal(left, right, check_exact=check_exact,
+                                   check_dtype=check_dtype,
+                                   check_less_precise=check_less_precise,
                                    **kwargs)
 
     elif isinstance(left, pd.DataFrame):
        return assert_frame_equal(left, right, check_exact=check_exact,
+                                 check_dtype=check_dtype,
+                                 check_less_precise=check_less_precise,
                                  **kwargs)
 
-    return _testing.assert_almost_equal(left, right, **kwargs)
+    else:
+        # other sequences
+        if check_dtype:
+            if is_number(left) and is_number(right):
+                # do not compare numeric classes, like np.float64 and float
+                pass
+            elif is_bool(left) and is_bool(right):
+                # do not compare bool classes, like np.bool_ and bool
+                pass
+            else:
+                if (isinstance(left, np.ndarray) or
+                        isinstance(right, np.ndarray)):
+                    obj = 'numpy array'
+                else:
+                    obj = 'Input'
+                assert_class_equal(left, right, obj=obj)
+        return _testing.assert_almost_equal(
+            left, right,
+            check_dtype=check_dtype,
+            check_less_precise=check_less_precise,
+            **kwargs)
+
+
+def assert_dict_equal(left, right, compare_keys=True):
 
-assert_dict_equal = _testing.assert_dict_equal
+    assertIsInstance(left, dict, '[dict] ')
+    assertIsInstance(right, dict, '[dict] ')
+
+    return _testing.assert_dict_equal(left, right, compare_keys=compare_keys)
 
 
 def randbool(size=(), p=0.5):
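assert_almost_equal above now dispatches on the container type and, for everything else, validates classes before comparing values, so an ndarray no longer compares equal to a plain list. A sketch of the intended behavior:

    import numpy as np
    import pandas.util.testing as tm

    tm.assert_almost_equal(np.array([1.0, 2.0]), np.array([1.0, 2.0]))  # ok
    tm.assert_almost_equal(np.float64(1.0), 1.0)  # ok, numeric classes skipped
    tm.assert_almost_equal(np.array([1, 2]), [1, 2])
    # AssertionError: the 'numpy array' class check fails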
@@ -657,7 +706,7 @@ def assert_equal(a, b, msg=""):
 
 def assert_index_equal(left, right, exact='equiv', check_names=True,
                        check_less_precise=False, check_exact=True,
-                       obj='Index'):
+                       check_categorical=True, obj='Index'):
     """Check that left and right Index are equal.
 
     Parameters
@@ -670,11 +719,14 @@ def assert_index_equal(left, right, exact='equiv', check_names=True,
         Int64Index as well
     check_names : bool, default True
         Whether to check the names attribute.
-    check_less_precise : bool, default False
+    check_less_precise : bool or int, default False
         Specify comparison precision. Only used when check_exact is False.
         5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare
     check_exact : bool, default True
         Whether to compare number exactly.
+    check_categorical : bool, default True
+        Whether to compare internal Categorical exactly.
     obj : str, default 'Index'
         Specify object name being compared, internally used to show
         appropriate assertion message
@@ -751,6 +803,13 @@ def _get_ilevel_values(index, level):
     # metadata comparison
     if check_names:
         assert_attr_equal('names', left, right, obj=obj)
+    if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex):
+        assert_attr_equal('freq', left, right, obj=obj)
+
+    if check_categorical:
+        if is_categorical_dtype(left) or is_categorical_dtype(right):
+            assert_categorical_equal(left.values, right.values,
+                                     obj='{0} category'.format(obj))
 
 
 def assert_class_equal(left, right, exact=True, obj='Input'):
@@ -903,18 +962,17 @@ def assertNotIsInstance(obj, cls, msg=''):
         raise AssertionError(err_msg.format(msg, cls))
 
 
-def assert_categorical_equal(res, exp):
-    assertIsInstance(res, pd.Categorical, '[Categorical] ')
-    assertIsInstance(exp, pd.Categorical, '[Categorical] ')
+def assert_categorical_equal(left, right, check_dtype=True,
+                             obj='Categorical'):
+    assertIsInstance(left, pd.Categorical, '[Categorical] ')
+    assertIsInstance(right, pd.Categorical, '[Categorical] ')
 
-    assert_index_equal(res.categories, exp.categories)
+    assert_index_equal(left.categories, right.categories,
+                       obj='{0}.categories'.format(obj))
+    assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype,
+                             obj='{0}.codes'.format(obj))
 
-    if not array_equivalent(res.codes, exp.codes):
-        raise AssertionError(
-            'codes not equivalent: {0} vs {1}.'.format(res.codes, exp.codes))
-
-    if res.ordered != exp.ordered:
-        raise AssertionError("ordered not the same")
+    assert_attr_equal('ordered', left, right, obj=obj)
 
 
 def raise_assert_detail(obj, message, left, right):
@@ -951,33 +1009,29 @@ def assert_numpy_array_equal(left, right, strict_nan=False,
         assertion message
     """
 
+    # instance validation
+    # to show a detailed error message when classes are different
+    assert_class_equal(left, right, obj=obj)
+    # both classes must be an np.ndarray
+    assertIsInstance(left, np.ndarray, '[ndarray] ')
+    assertIsInstance(right, np.ndarray, '[ndarray] ')
+
     def _raise(left, right, err_msg):
         if err_msg is None:
-            # show detailed error
-            if lib.isscalar(left) and lib.isscalar(right):
-                # show scalar comparison error
-                assert_equal(left, right)
-            elif is_list_like(left) and is_list_like(right):
-                # some test cases pass list
-                left = np.asarray(left)
-                right = np.array(right)
-
-                if left.shape != right.shape:
-                    raise_assert_detail(obj, '{0} shapes are different'
-                                        .format(obj), left.shape, right.shape)
-
-                diff = 0
-                for l, r in zip(left, right):
-                    # count up differences
-                    if not array_equivalent(l, r, strict_nan=strict_nan):
-                        diff += 1
-
-                diff = diff * 100.0 / left.size
-                msg = '{0} values are different ({1} %)'\
-                    .format(obj, np.round(diff, 5))
-                raise_assert_detail(obj, msg, left, right)
-            else:
-                assert_class_equal(left, right, obj=obj)
+            if left.shape != right.shape:
+                raise_assert_detail(obj, '{0} shapes are different'
+                                    .format(obj), left.shape, right.shape)
+
+            diff = 0
+            for l, r in zip(left, right):
+                # count up differences
+                if not array_equivalent(l, r, strict_nan=strict_nan):
+                    diff += 1
+
+            diff = diff * 100.0 / left.size
+            msg = '{0} values are different ({1} %)'\
+                .format(obj, np.round(diff, 5))
+            raise_assert_detail(obj, msg, left, right)
 
         raise AssertionError(err_msg)
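The rewritten assert_categorical_equal above compares categories, codes, and the ordered flag through the standard assertion helpers, so mismatches now produce the detailed raise_assert_detail output. For instance (illustrative):

    import pandas as pd
    import pandas.util.testing as tm

    left = pd.Categorical(['a', 'b'], categories=['a', 'b'])
    right = pd.Categorical(['a', 'b'], categories=['b', 'a'])
    tm.assert_categorical_equal(left, left)   # passes
    tm.assert_categorical_equal(left, right)  # AssertionError on .categories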
@@ -1000,6 +1054,7 @@ def assert_series_equal(left, right, check_dtype=True,
                         check_names=True,
                         check_exact=False,
                         check_datetimelike_compat=False,
+                        check_categorical=True,
                         obj='Series'):
     """Check that left and right Series are equal.
 
@@ -1015,15 +1070,18 @@ def assert_series_equal(left, right, check_dtype=True,
         are identical.
     check_series_type : bool, default False
         Whether to check the Series class is identical.
-    check_less_precise : bool, default False
+    check_less_precise : bool or int, default False
         Specify comparison precision. Only used when check_exact is False.
         5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare
     check_exact : bool, default False
         Whether to compare number exactly.
     check_names : bool, default True
         Whether to check the Series and Index names attribute.
     check_dateteimelike_compat : bool, default False
         Compare datetime-like which is comparable ignoring dtype.
+    check_categorical : bool, default True
+        Whether to compare internal Categorical exactly.
     obj : str, default 'Series'
         Specify object name being compared, internally used to show
         appropriate assertion message
@@ -1050,6 +1108,7 @@ def assert_series_equal(left, right, check_dtype=True,
                        check_names=check_names,
                        check_less_precise=check_less_precise,
                        check_exact=check_exact,
+                       check_categorical=check_categorical,
                        obj='{0}.index'.format(obj))
 
     if check_dtype:
@@ -1057,8 +1116,8 @@ def assert_series_equal(left, right, check_dtype=True,
 
     if check_exact:
         assert_numpy_array_equal(left.get_values(), right.get_values(),
-                                 obj='{0}'.format(obj),
-                                 check_dtype=check_dtype)
+                                 check_dtype=check_dtype,
+                                 obj='{0}'.format(obj),)
     elif check_datetimelike_compat:
         # we want to check only if we have compat dtypes
         # e.g. integer and M|m are NOT compat, but we can simply check
@@ -1074,11 +1133,11 @@ def assert_series_equal(left, right, check_dtype=True,
             msg = '[datetimelike_compat=True] {0} is not equal to {1}.'
             raise AssertionError(msg.format(left.values, right.values))
         else:
-            assert_numpy_array_equal(left.values, right.values,
+            assert_numpy_array_equal(left.get_values(), right.get_values(),
                                      check_dtype=check_dtype)
     else:
         _testing.assert_almost_equal(left.get_values(), right.get_values(),
-                                     check_less_precise,
+                                     check_less_precise=check_less_precise,
                                      check_dtype=check_dtype,
                                      obj='{0}'.format(obj))
 
@@ -1086,6 +1145,11 @@ def assert_series_equal(left, right, check_dtype=True,
     if check_names:
         assert_attr_equal('name', left, right, obj=obj)
 
+    if check_categorical:
+        if is_categorical_dtype(left) or is_categorical_dtype(right):
+            assert_categorical_equal(left.values, right.values,
+                                     obj='{0} category'.format(obj))
+
 
 # This could be refactored to use the NDFrame.equals method
 def assert_frame_equal(left, right, check_dtype=True,
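The check_categorical flag threaded through above lets callers relax the exact Categorical comparison while the values themselves still match. Both modes, sketched:

    import pandas as pd
    import pandas.util.testing as tm

    left = pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b']))
    right = pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']))

    tm.assert_series_equal(left, right, check_categorical=False)  # passes
    tm.assert_series_equal(left, right)  # AssertionError: categories differ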
@@ -1097,6 +1161,7 @@ def assert_frame_equal(left, right, check_dtype=True,
                       by_blocks=False,
                       check_exact=False,
                       check_datetimelike_compat=False,
+                      check_categorical=True,
                       check_like=False,
                       obj='DataFrame'):
 
@@ -1116,9 +1181,10 @@ def assert_frame_equal(left, right, check_dtype=True,
         are identical.
     check_frame_type : bool, default False
         Whether to check the DataFrame class is identical.
-    check_less_precise : bool, default False
+    check_less_precise : bool or int, default False
         Specify comparison precision. Only used when check_exact is False.
         5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare
     check_names : bool, default True
         Whether to check the Index names attribute.
     by_blocks : bool, default False
@@ -1128,6 +1194,8 @@ def assert_frame_equal(left, right, check_dtype=True,
         Whether to compare number exactly.
     check_dateteimelike_compat : bool, default False
         Compare datetime-like which is comparable ignoring dtype.
+    check_categorical : bool, default True
+        Whether to compare internal Categorical exactly.
     check_like : bool, default False
         If true, then reindex_like operands
     obj : str, default 'DataFrame'
@@ -1169,6 +1237,7 @@ def assert_frame_equal(left, right, check_dtype=True,
                        check_names=check_names,
                        check_less_precise=check_less_precise,
                        check_exact=check_exact,
+                       check_categorical=check_categorical,
                        obj='{0}.index'.format(obj))
 
     # column comparison
@@ -1176,6 +1245,7 @@ def assert_frame_equal(left, right, check_dtype=True,
                        check_names=check_names,
                        check_less_precise=check_less_precise,
                        check_exact=check_exact,
+                       check_categorical=check_categorical,
                        obj='{0}.columns'.format(obj))
 
    # compare by blocks
@@ -1200,6 +1270,7 @@ def assert_frame_equal(left, right, check_dtype=True,
                            check_less_precise=check_less_precise,
                            check_exact=check_exact, check_names=check_names,
                            check_datetimelike_compat=check_datetimelike_compat,
+                           check_categorical=check_categorical,
                            obj='DataFrame.iloc[:, {0}]'.format(i))
 
 
@@ -1220,9 +1291,10 @@ def assert_panelnd_equal(left, right,
         Whether to check the Panel dtype is identical.
     check_panel_type : bool, default False
         Whether to check the Panel class is identical.
-    check_less_precise : bool, default False
+    check_less_precise : bool or int, default False
         Specify comparison precision. Only used when check_exact is False.
         5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare
     assert_func : function for comparing data
     check_names : bool, default True
         Whether to check the Index names attribute.
@@ -1284,11 +1356,7 @@ def assert_sp_array_equal(left, right):
         raise_assert_detail('SparseArray.index', 'index are not equal',
                             left.sp_index, right.sp_index)
 
-    if np.isnan(left.fill_value):
-        assert (np.isnan(right.fill_value))
-    else:
-        assert (left.fill_value == right.fill_value)
-
+    assert_attr_equal('fill_value', left, right)
     assert_attr_equal('dtype', left, right)
     assert_numpy_array_equal(left.values, right.values)
 
diff --git a/pandas/util/validators.py b/pandas/util/validators.py
index 2166dc45db605..bbfd24df9c13e 100644
--- a/pandas/util/validators.py
+++ b/pandas/util/validators.py
@@ -42,7 +42,16 @@ def _check_for_default_values(fname, arg_val_dict, compat_args):
             # as comparison may have been overriden for the left
             # hand object
             try:
-                match = (arg_val_dict[key] == compat_args[key])
+                v1 = arg_val_dict[key]
+                v2 = compat_args[key]
+
+                # check for None-ness otherwise we could end up
+                # comparing a numpy array vs None
+                if (v1 is not None and v2 is None) or \
+                        (v1 is None and v2 is not None):
+                    match = False
+                else:
+                    match = (v1 == v2)
 
                 if not is_bool(match):
                     raise ValueError("'match' is not a boolean")
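The None-ness guard above exists because, depending on the NumPy version, comparing an array against None can yield an elementwise result rather than a single bool, which then trips the is_bool(match) check. A minimal illustration of the failure mode the patch sidesteps:

    import numpy as np

    v1, v2 = np.array([1, 2]), None
    match = (v1 == v2)  # may be array([False, False]), not a bool
    # with the guard: a None-ness mismatch short-circuits to a plain bool
    match = False if (v1 is None) != (v2 is None) else (v1 == v2)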