From b042377e7392be3370ec883166b06b30ab5c57c1 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 25 Apr 2020 03:52:19 +0200 Subject: [PATCH 01/32] Add warnings when rows in csv file have too many values --- pandas/io/parsers.py | 17 ++++++++++++++--- pandas/tests/io/parser/test_common.py | 14 +++++++++++--- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2df81ba0aa51a..dc49bd7705fb9 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -49,7 +49,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna, notna from pandas.core import algorithms from pandas.core.arrays import Categorical @@ -2151,7 +2151,12 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] - + if len(names) != len(data) and notna(data[len(names) :]).any(): + warnings.warn( + "Expected {} columns instead of {}".format(len(names), len(data)), + ParserWarning, + stacklevel=2, + ) data = {k: v for k, (i, v) in zip(names, data)} names, data = self._do_date_conversions(names, data) @@ -2159,7 +2164,6 @@ def read(self, nrows=None): # maybe create a mi on the columns names = self._maybe_make_multi_index_columns(names, self.col_names) - return index, names, data def _filter_usecols(self, names): @@ -2508,6 +2512,13 @@ def read(self, rows=None): content = content[1:] alldata = self._rows_to_cols(content) + if len(columns) != len(alldata) and notna(alldata[len(columns) :]).any(): + warnings.warn( + "Expected {} columns instead of {}".format(len(columns), len(alldata)), + ParserWarning, + stacklevel=2, + ) + data = self._exclude_implicit_index(alldata) columns = self._maybe_dedup_names(self.columns) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 5bf9587a6ca22..0af94de7c803a 100644 --- a/pandas/tests/io/parser/test_common.py +++ 
b/pandas/tests/io/parser/test_common.py @@ -14,7 +14,7 @@ import pytest from pandas._libs.tslib import Timestamp -from pandas.errors import DtypeWarning, EmptyDataError, ParserError +from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, compat, concat @@ -2124,5 +2124,13 @@ def test_no_header_two_extra_columns(all_parsers): ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) stream = StringIO("foo,bar,baz,bam,blah") parser = all_parsers - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) - tm.assert_frame_equal(df, ref) + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + tm.assert_frame_equal(df, ref) + + +def test_first_row_length(all_parsers): + stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6\n6,7,8") + parser = all_parsers + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(stream, index_col=False) From 89a04c59562f958d8a33690a9cd3e2b0afebbbd7 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 25 Apr 2020 04:05:12 +0200 Subject: [PATCH 02/32] Remove unused variable --- pandas/tests/io/parser/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 0af94de7c803a..0bc4832143711 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2133,4 +2133,4 @@ def test_first_row_length(all_parsers): stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6\n6,7,8") parser = all_parsers with tm.assert_produces_warning(ParserWarning): - df = parser.read_csv(stream, index_col=False) + parser.read_csv(stream, index_col=False) From 23c910928515d04ee544a8cfcaadae66087836ed Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 02:19:36 +0200 Subject: [PATCH 
03/32] Add helper function --- pandas/io/parsers.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dc49bd7705fb9..428f7ffc1f5fd 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2151,12 +2151,8 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] - if len(names) != len(data) and notna(data[len(names) :]).any(): - warnings.warn( - "Expected {} columns instead of {}".format(len(names), len(data)), - ParserWarning, - stacklevel=2, - ) + _check_unexpected_data(names, data) + data = {k: v for k, (i, v) in zip(names, data)} names, data = self._do_date_conversions(names, data) @@ -2192,6 +2188,15 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values +def _check_unexpected_data(columns, data): + if len(columns) != len(data) and notna(data[len(columns) :]).any(): + warnings.warn( + "Expected {} columns instead of {}".format(len(columns), len(data)), + ParserWarning, + stacklevel=2, + ) + + def TextParser(*args, **kwds): """ Converts lists of lists/tuples into DataFrames with proper type inference @@ -2512,12 +2517,8 @@ def read(self, rows=None): content = content[1:] alldata = self._rows_to_cols(content) - if len(columns) != len(alldata) and notna(alldata[len(columns) :]).any(): - warnings.warn( - "Expected {} columns instead of {}".format(len(columns), len(alldata)), - ParserWarning, - stacklevel=2, - ) + + _check_unexpected_data(columns, alldata) data = self._exclude_implicit_index(alldata) From 77537c258c86a227ec82e5d79177b6e0fe7d47f9 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 02:21:32 +0200 Subject: [PATCH 04/32] Add comma in test --- pandas/tests/io/parser/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 0bc4832143711..5d2adf290983c 100644 --- 
a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2130,7 +2130,7 @@ def test_no_header_two_extra_columns(all_parsers): def test_first_row_length(all_parsers): - stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6\n6,7,8") + stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6,\n6,7,8") parser = all_parsers with tm.assert_produces_warning(ParserWarning): parser.read_csv(stream, index_col=False) From 9bb7a86f251c6ae0664f95bb6a4af7d7ee22aea9 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 21:56:02 +0200 Subject: [PATCH 05/32] Include index_col and usecols in check --- pandas/io/parsers.py | 15 ++++++++++----- pandas/tests/io/parser/test_common.py | 6 +++--- pandas/tests/io/test_clipboard.py | 1 + 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a7fb0e3fd1b28..402567f7a848d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2151,7 +2151,8 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] - _check_unexpected_data(names, data) + if self.usecols is None: + _check_unexpected_data(names, data, self.index_col) data = {k: v for k, (i, v) in zip(names, data)} @@ -2188,10 +2189,13 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values -def _check_unexpected_data(columns, data): - if len(columns) != len(data) and notna(data[len(columns) :]).any(): +def _check_unexpected_data(columns, data, index_col): + if index_col is None or index_col is False: + index_col = [] + expected_columns = len(columns) + len(index_col) + if expected_columns != len(data) and notna(data[expected_columns :]).any(): warnings.warn( - "Expected {} columns instead of {}".format(len(columns), len(data)), + "Expected {} columns instead of {}".format(expected_columns, len(data)), ParserWarning, stacklevel=2, ) @@ -2518,7 +2522,8 @@ def read(self, rows=None): alldata = self._rows_to_cols(content) - 
_check_unexpected_data(columns, alldata) + if self.usecols is None: + _check_unexpected_data(columns, alldata, self.index_col) data = self._exclude_implicit_index(alldata) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c7789f6eca113..946f38875cd32 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1070,8 +1070,8 @@ def test_trailing_delimiters(all_parsers): 4,5,6, 7,8,9,""" parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=False) - + with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv(StringIO(data), index_col=False) expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) tm.assert_frame_equal(result, expected) @@ -2135,7 +2135,7 @@ def test_no_header_two_extra_columns(all_parsers): parser = all_parsers with tm.assert_produces_warning(ParserWarning): df = parser.read_csv(stream, header=None, names=column_names, index_col=False) - tm.assert_frame_equal(df, ref) + tm.assert_frame_equal(df, ref) def test_first_row_length(all_parsers): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index b627e0e1cad54..90ab515cf3a02 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -164,6 +164,7 @@ def test_round_trip_frame_sep(self, df, sep): # Test white space separator def test_round_trip_frame_string(self, df): df.to_clipboard(excel=False, sep=None) + result = read_clipboard() assert df.to_string() == result.to_string() assert df.shape == result.shape From 2d661e8c2350839e4b91c4eb1362a6e3cbe63dcf Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 21:58:05 +0200 Subject: [PATCH 06/32] Run black --- pandas/io/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 402567f7a848d..81ef5272697c5 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2190,10 
+2190,10 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): def _check_unexpected_data(columns, data, index_col): - if index_col is None or index_col is False: + if index_col is None or index_col is False: index_col = [] expected_columns = len(columns) + len(index_col) - if expected_columns != len(data) and notna(data[expected_columns :]).any(): + if expected_columns != len(data) and notna(data[expected_columns:]).any(): warnings.warn( "Expected {} columns instead of {}".format(expected_columns, len(data)), ParserWarning, From 61d66ab3018401d31a9df71ec12bcb196b360035 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 22:34:56 +0200 Subject: [PATCH 07/32] Add docstring --- pandas/io/parsers.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 81ef5272697c5..15a5fb4d4cb61 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2190,6 +2190,19 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): def _check_unexpected_data(columns, data, index_col): + """ + Checks whether or not ammount of columns in data matches expected number of columns. + Raises a warning if those numbers don't match. + + Parameters + ---------- + columns : list + List that contains columns names. + data : array-like + Object that contains column data. + index_col : list or False, optional + Columns to use as the index. 
+ """ if index_col is None or index_col is False: index_col = [] expected_columns = len(columns) + len(index_col) From c94b45e1edd4494eee2a8885c25e041f6100eba6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 15 May 2020 17:38:04 +0200 Subject: [PATCH 08/32] PERF: Remove unnecessary copies in sorting functions --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 25312b180dba1..da9cbe1023599 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -385,7 +385,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): from pandas.core.indexes.api import Index if not key: - return values.copy() + return values if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) From 0ab450b9ea5f38582d09acbcd8f697ac62f37919 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 19:06:23 +0200 Subject: [PATCH 09/32] Run tests --- pandas/core/sorting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da9cbe1023599..2943714a5d015 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -386,7 +386,6 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): if not key: return values - if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) @@ -404,7 +403,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): else: type_of_values = type(values) result = type_of_values(result) # try to revert to original type otherwise - except TypeError: + except TypeError:opy() raise TypeError( f"User-provided `key` function returned an invalid type {type(result)} \ which could not be converted to {type(values)}." 
From 54c7304d585c60dd148e3e47aa28514100289eb5 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 19:07:12 +0200 Subject: [PATCH 10/32] Run tests --- pandas/core/sorting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2943714a5d015..da9cbe1023599 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -386,6 +386,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): if not key: return values + if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) @@ -403,7 +404,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): else: type_of_values = type(values) result = type_of_values(result) # try to revert to original type otherwise - except TypeError:opy() + except TypeError: raise TypeError( f"User-provided `key` function returned an invalid type {type(result)} \ which could not be converted to {type(values)}." From e00993d6233d8c626ac4bc52e90adaadf1adcb66 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 21:09:40 +0200 Subject: [PATCH 11/32] Move function --- pandas/io/parsers.py | 50 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 15a5fb4d4cb61..74e3a4d5d6bc0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1335,6 +1335,31 @@ def _validate_parse_dates_arg(parse_dates): return parse_dates +def _check_unexpected_data(columns, data, index_col): + """ + Checks whether or not ammount of columns in data matches expected number of columns. + Raises a warning if those numbers don't match. + + Parameters + ---------- + columns : list + List that contains columns names. + data : array-like + Object that contains column data. + index_col : list or False, optional + Columns to use as the index. 
+ """ + if index_col is None or index_col is False: + index_col = [] + expected_columns = len(columns) + len(index_col) + if expected_columns != len(data) and notna(data[expected_columns:]).any(): + warnings.warn( + "Expected {} columns instead of {}".format(expected_columns, len(data)), + ParserWarning, + stacklevel=2, + ) + + class ParserBase: def __init__(self, kwds): self.names = kwds.get("names") @@ -2189,31 +2214,6 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values -def _check_unexpected_data(columns, data, index_col): - """ - Checks whether or not ammount of columns in data matches expected number of columns. - Raises a warning if those numbers don't match. - - Parameters - ---------- - columns : list - List that contains columns names. - data : array-like - Object that contains column data. - index_col : list or False, optional - Columns to use as the index. - """ - if index_col is None or index_col is False: - index_col = [] - expected_columns = len(columns) + len(index_col) - if expected_columns != len(data) and notna(data[expected_columns:]).any(): - warnings.warn( - "Expected {} columns instead of {}".format(expected_columns, len(data)), - ParserWarning, - stacklevel=2, - ) - - def TextParser(*args, **kwds): """ Converts lists of lists/tuples into DataFrames with proper type inference From 6d72a346770fc93778a83e171daceec52b60e6d4 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 23:19:58 +0200 Subject: [PATCH 12/32] Add asv --- asv_bench/benchmarks/algorithms.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 65e52e03c43c7..a96d9bc924308 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -174,4 +174,13 @@ def time_argsort(self, N): self.array.argsort() +class SortIndexSeries: + def setup(self): + N = 10 ** 5 + idx = pd.date_range(start="1/1/2000", periods=N, freq="s") + self.s 
= pd.Series(np.random.randn(N), index=idx) + + def time_sort_index(self): + self.s.sort_index() + from .pandas_vb_common import setup # noqa: F401 isort:skip From 5ba54a6039d3981a4187b38e11b479e53f8dcdd1 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 23:20:53 +0200 Subject: [PATCH 13/32] Run black --- asv_bench/benchmarks/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index a96d9bc924308..7afa97f9aa394 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -183,4 +183,5 @@ def setup(self): def time_sort_index(self): self.s.sort_index() + from .pandas_vb_common import setup # noqa: F401 isort:skip From 276627019d8000792473742c0a9036cf59b5f3cb Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 00:28:24 +0200 Subject: [PATCH 14/32] Remove asv --- asv_bench/benchmarks/algorithms.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 7afa97f9aa394..65e52e03c43c7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -174,14 +174,4 @@ def time_argsort(self, N): self.array.argsort() -class SortIndexSeries: - def setup(self): - N = 10 ** 5 - idx = pd.date_range(start="1/1/2000", periods=N, freq="s") - self.s = pd.Series(np.random.randn(N), index=idx) - - def time_sort_index(self): - self.s.sort_index() - - from .pandas_vb_common import setup # noqa: F401 isort:skip From 412cd45dadf37696ebfb0273d84197b076ae6be7 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 28 May 2020 16:36:56 +0200 Subject: [PATCH 15/32] Run tests --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 74e3a4d5d6bc0..930ad9c1eff96 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1337,7 +1337,7 @@ def 
_validate_parse_dates_arg(parse_dates): def _check_unexpected_data(columns, data, index_col): """ - Checks whether or not ammount of columns in data matches expected number of columns. + Checks if ammount of columns in data matches expected number of columns. Raises a warning if those numbers don't match. Parameters From 4d7c568ddaf2cbd9563587602dc62ed5a60c3c04 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 1 Jun 2020 04:04:23 +0200 Subject: [PATCH 16/32] Remove newline --- pandas/tests/io/test_clipboard.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 90ab515cf3a02..b627e0e1cad54 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -164,7 +164,6 @@ def test_round_trip_frame_sep(self, df, sep): # Test white space separator def test_round_trip_frame_string(self, df): df.to_clipboard(excel=False, sep=None) - result = read_clipboard() assert df.to_string() == result.to_string() assert df.shape == result.shape From bbe77ca2903bfde75f5411df62e362857bee0e4f Mon Sep 17 00:00:00 2001 From: mproszewska Date: Wed, 3 Jun 2020 02:02:32 +0200 Subject: [PATCH 17/32] Fix --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da9cbe1023599..25312b180dba1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -385,7 +385,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): from pandas.core.indexes.api import Index if not key: - return values + return values.copy() if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) From d9aa31967e1286dc844773abb2be094a0b43a4ac Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 5 Jun 2020 03:51:05 +0200 Subject: [PATCH 18/32] Add asv --- asv_bench/benchmarks/io/excel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..926286ee5fab2 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 2000 + N = 20000 C = 5 df = DataFrame( np.random.randn(N, C), @@ -69,5 +69,9 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) + def nrows_read_excel(self, engine): + name = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine, nrows=1) + from ..pandas_vb_common import setup # noqa: F401 isort:skip From 0afb1b14c359eece44f3885d5f20b40e07a9ccb6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 5 Jun 2020 12:31:33 +0200 Subject: [PATCH 19/32] Fix --- asv_bench/benchmarks/io/excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 926286ee5fab2..e9776ff2c641e 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -70,7 +70,7 @@ def time_read_excel(self, engine): read_excel(fname, engine=engine) def nrows_read_excel(self, engine): - name = self.fname_odf if engine == "odf" else self.fname_excel + fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine, nrows=1) From 35539d0f6da6fcdde349429ba7c41b9fe573e0ad Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 25 Apr 2020 03:52:19 +0200 Subject: [PATCH 20/32] Add warnings when rows in csv file have too many values --- pandas/io/parsers.py | 17 ++++++++++++++--- pandas/tests/io/parser/test_common.py | 14 +++++++++++--- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index aca2f9f5ac5bb..291d9296d188e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -49,7 +49,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import 
CategoricalDtype -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna, notna from pandas.core import algorithms from pandas.core.arrays import Categorical @@ -2151,7 +2151,12 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] - + if len(names) != len(data) and notna(data[len(names) :]).any(): + warnings.warn( + "Expected {} columns instead of {}".format(len(names), len(data)), + ParserWarning, + stacklevel=2, + ) data = {k: v for k, (i, v) in zip(names, data)} names, data = self._do_date_conversions(names, data) @@ -2159,7 +2164,6 @@ def read(self, nrows=None): # maybe create a mi on the columns names = self._maybe_make_multi_index_columns(names, self.col_names) - return index, names, data def _filter_usecols(self, names): @@ -2508,6 +2512,13 @@ def read(self, rows=None): content = content[1:] alldata = self._rows_to_cols(content) + if len(columns) != len(alldata) and notna(alldata[len(columns) :]).any(): + warnings.warn( + "Expected {} columns instead of {}".format(len(columns), len(alldata)), + ParserWarning, + stacklevel=2, + ) + data = self._exclude_implicit_index(alldata) columns = self._maybe_dedup_names(self.columns) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 55256499c6bb2..1db0323dc8e56 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -14,7 +14,7 @@ import pytest from pandas._libs.tslib import Timestamp -from pandas.errors import DtypeWarning, EmptyDataError, ParserError +from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, compat, concat @@ -2133,5 +2133,13 @@ def test_no_header_two_extra_columns(all_parsers): ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) stream = StringIO("foo,bar,baz,bam,blah") parser = all_parsers - df = parser.read_csv(stream, 
header=None, names=column_names, index_col=False) - tm.assert_frame_equal(df, ref) + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + tm.assert_frame_equal(df, ref) + + +def test_first_row_length(all_parsers): + stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6\n6,7,8") + parser = all_parsers + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(stream, index_col=False) From 358113b96faefa81ae24ca745b438226ea19616e Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 25 Apr 2020 04:05:12 +0200 Subject: [PATCH 21/32] Remove unused variable --- pandas/tests/io/parser/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1db0323dc8e56..7b6bce74fd2e5 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2142,4 +2142,4 @@ def test_first_row_length(all_parsers): stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6\n6,7,8") parser = all_parsers with tm.assert_produces_warning(ParserWarning): - df = parser.read_csv(stream, index_col=False) + parser.read_csv(stream, index_col=False) From ab224290f2e102103a38f8c75f55963d3f189c7c Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 02:19:36 +0200 Subject: [PATCH 22/32] Add helper function --- pandas/io/parsers.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 291d9296d188e..a7fb0e3fd1b28 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2151,12 +2151,8 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] - if len(names) != len(data) and notna(data[len(names) :]).any(): - warnings.warn( - "Expected {} columns instead of {}".format(len(names), len(data)), - ParserWarning, - stacklevel=2, - ) + _check_unexpected_data(names, data) 
+ data = {k: v for k, (i, v) in zip(names, data)} names, data = self._do_date_conversions(names, data) @@ -2192,6 +2188,15 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values +def _check_unexpected_data(columns, data): + if len(columns) != len(data) and notna(data[len(columns) :]).any(): + warnings.warn( + "Expected {} columns instead of {}".format(len(columns), len(data)), + ParserWarning, + stacklevel=2, + ) + + def TextParser(*args, **kwds): """ Converts lists of lists/tuples into DataFrames with proper type inference @@ -2512,12 +2517,8 @@ def read(self, rows=None): content = content[1:] alldata = self._rows_to_cols(content) - if len(columns) != len(alldata) and notna(alldata[len(columns) :]).any(): - warnings.warn( - "Expected {} columns instead of {}".format(len(columns), len(alldata)), - ParserWarning, - stacklevel=2, - ) + + _check_unexpected_data(columns, alldata) data = self._exclude_implicit_index(alldata) From 996213d56f2fc9cdba0d5f822f48e5e381098213 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 02:21:32 +0200 Subject: [PATCH 23/32] Add comma in test --- pandas/tests/io/parser/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 7b6bce74fd2e5..c7789f6eca113 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2139,7 +2139,7 @@ def test_no_header_two_extra_columns(all_parsers): def test_first_row_length(all_parsers): - stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6\n6,7,8") + stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6,\n6,7,8") parser = all_parsers with tm.assert_produces_warning(ParserWarning): parser.read_csv(stream, index_col=False) From 17d9b1205e0467e72a712033c28da5032c26426e Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 21:56:02 +0200 Subject: [PATCH 24/32] Include index_col and usecols in check --- 
pandas/io/parsers.py | 15 ++++++++++----- pandas/tests/io/parser/test_common.py | 6 +++--- pandas/tests/io/test_clipboard.py | 1 + 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a7fb0e3fd1b28..402567f7a848d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2151,7 +2151,8 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] - _check_unexpected_data(names, data) + if self.usecols is None: + _check_unexpected_data(names, data, self.index_col) data = {k: v for k, (i, v) in zip(names, data)} @@ -2188,10 +2189,13 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values -def _check_unexpected_data(columns, data): - if len(columns) != len(data) and notna(data[len(columns) :]).any(): +def _check_unexpected_data(columns, data, index_col): + if index_col is None or index_col is False: + index_col = [] + expected_columns = len(columns) + len(index_col) + if expected_columns != len(data) and notna(data[expected_columns :]).any(): warnings.warn( - "Expected {} columns instead of {}".format(len(columns), len(data)), + "Expected {} columns instead of {}".format(expected_columns, len(data)), ParserWarning, stacklevel=2, ) @@ -2518,7 +2522,8 @@ def read(self, rows=None): alldata = self._rows_to_cols(content) - _check_unexpected_data(columns, alldata) + if self.usecols is None: + _check_unexpected_data(columns, alldata, self.index_col) data = self._exclude_implicit_index(alldata) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c7789f6eca113..946f38875cd32 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1070,8 +1070,8 @@ def test_trailing_delimiters(all_parsers): 4,5,6, 7,8,9,""" parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=False) - + with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv(StringIO(data), 
index_col=False) expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) tm.assert_frame_equal(result, expected) @@ -2135,7 +2135,7 @@ def test_no_header_two_extra_columns(all_parsers): parser = all_parsers with tm.assert_produces_warning(ParserWarning): df = parser.read_csv(stream, header=None, names=column_names, index_col=False) - tm.assert_frame_equal(df, ref) + tm.assert_frame_equal(df, ref) def test_first_row_length(all_parsers): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index b627e0e1cad54..90ab515cf3a02 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -164,6 +164,7 @@ def test_round_trip_frame_sep(self, df, sep): # Test white space separator def test_round_trip_frame_string(self, df): df.to_clipboard(excel=False, sep=None) + result = read_clipboard() assert df.to_string() == result.to_string() assert df.shape == result.shape From 44a5da5b5296c1754fee978e000e0bcf0133a2f2 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 21:58:05 +0200 Subject: [PATCH 25/32] Run black --- pandas/io/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 402567f7a848d..81ef5272697c5 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2190,10 +2190,10 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): def _check_unexpected_data(columns, data, index_col): - if index_col is None or index_col is False: + if index_col is None or index_col is False: index_col = [] expected_columns = len(columns) + len(index_col) - if expected_columns != len(data) and notna(data[expected_columns :]).any(): + if expected_columns != len(data) and notna(data[expected_columns:]).any(): warnings.warn( "Expected {} columns instead of {}".format(expected_columns, len(data)), ParserWarning, From c191274650a736168e230b25a40149c2b4a52d5f Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 
May 2020 22:34:56 +0200 Subject: [PATCH 26/32] Add docstring --- pandas/io/parsers.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 81ef5272697c5..15a5fb4d4cb61 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2190,6 +2190,19 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): def _check_unexpected_data(columns, data, index_col): + """ + Checks whether or not ammount of columns in data matches expected number of columns. + Raises a warning if those numbers don't match. + + Parameters + ---------- + columns : list + List that contains columns names. + data : array-like + Object that contains column data. + index_col : list or False, optional + Columns to use as the index. + """ if index_col is None or index_col is False: index_col = [] expected_columns = len(columns) + len(index_col) From 0567294975a7ce97a41f09307c7a47ed20d9e7a2 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 21:09:40 +0200 Subject: [PATCH 27/32] Move function --- pandas/io/parsers.py | 50 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 15a5fb4d4cb61..74e3a4d5d6bc0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1335,6 +1335,31 @@ def _validate_parse_dates_arg(parse_dates): return parse_dates +def _check_unexpected_data(columns, data, index_col): + """ + Checks whether or not ammount of columns in data matches expected number of columns. + Raises a warning if those numbers don't match. + + Parameters + ---------- + columns : list + List that contains columns names. + data : array-like + Object that contains column data. + index_col : list or False, optional + Columns to use as the index. 
+ """ + if index_col is None or index_col is False: + index_col = [] + expected_columns = len(columns) + len(index_col) + if expected_columns != len(data) and notna(data[expected_columns:]).any(): + warnings.warn( + "Expected {} columns instead of {}".format(expected_columns, len(data)), + ParserWarning, + stacklevel=2, + ) + + class ParserBase: def __init__(self, kwds): self.names = kwds.get("names") @@ -2189,31 +2214,6 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values -def _check_unexpected_data(columns, data, index_col): - """ - Checks whether or not ammount of columns in data matches expected number of columns. - Raises a warning if those numbers don't match. - - Parameters - ---------- - columns : list - List that contains columns names. - data : array-like - Object that contains column data. - index_col : list or False, optional - Columns to use as the index. - """ - if index_col is None or index_col is False: - index_col = [] - expected_columns = len(columns) + len(index_col) - if expected_columns != len(data) and notna(data[expected_columns:]).any(): - warnings.warn( - "Expected {} columns instead of {}".format(expected_columns, len(data)), - ParserWarning, - stacklevel=2, - ) - - def TextParser(*args, **kwds): """ Converts lists of lists/tuples into DataFrames with proper type inference From 31c9bd0387d2da882a94d4717afe6721ac91d413 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 28 May 2020 16:36:56 +0200 Subject: [PATCH 28/32] Run tests --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 74e3a4d5d6bc0..930ad9c1eff96 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1337,7 +1337,7 @@ def _validate_parse_dates_arg(parse_dates): def _check_unexpected_data(columns, data, index_col): """ - Checks whether or not ammount of columns in data matches expected number of columns. 
+ Checks if amount of columns in data matches expected number of columns. Raises a warning if those numbers don't match. Parameters From 9a8449807920a5f388fffbe2b9f160bf3e0d0c56 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 1 Jun 2020 04:04:23 +0200 Subject: [PATCH 29/32] Remove newline --- pandas/tests/io/test_clipboard.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 90ab515cf3a02..b627e0e1cad54 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -164,7 +164,6 @@ def test_round_trip_frame_sep(self, df, sep): # Test white space separator def test_round_trip_frame_string(self, df): df.to_clipboard(excel=False, sep=None) - result = read_clipboard() assert df.to_string() == result.to_string() assert df.shape == result.shape From 459250bbca948cee5195ced6c3aae5556d06b771 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Wed, 3 Jun 2020 02:02:32 +0200 Subject: [PATCH 30/32] Fix --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da9cbe1023599..25312b180dba1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -385,7 +385,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): from pandas.core.indexes.api import Index if not key: - return values + return values.copy() if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) From cd1239ff0df162ac00c1b90c308c2715ae2eeb44 Mon Sep 17 00:00:00 2001 From: Magdalena Proszewska Date: Thu, 8 Oct 2020 21:20:10 +0200 Subject: [PATCH 31/32] Resolve conflicts --- pandas/tests/io/parser/test_common.py | 59 +++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 946f38875cd32..d775abca32550 100644 --- a/pandas/tests/io/parser/test_common.py +++ 
b/pandas/tests/io/parser/test_common.py @@ -2137,6 +2137,65 @@ def test_no_header_two_extra_columns(all_parsers): df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) +def test_read_csv_names_not_accepting_sets(all_parsers): + # GH 34946 + data = """\ + 1,2,3 + 4,5,6\n""" + parser = all_parsers + with pytest.raises(ValueError, match="Names should be an ordered collection."): + parser.read_csv(StringIO(data), names=set("QAZ")) + + +def test_read_csv_with_use_inf_as_na(all_parsers): + # https://github.com/pandas-dev/pandas/issues/35493 + parser = all_parsers + data = "1.0\nNaN\n3.0" + with option_context("use_inf_as_na", True): + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([1.0, np.nan, 3.0]) + tm.assert_frame_equal(result, expected) + + +def test_read_table_delim_whitespace_default_sep(all_parsers): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + result = parser.read_table(f, delim_whitespace=True) + expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, sep=delimiter) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) + + +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." 
+ ) + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, sep=delimiter) + + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, delimiter=delimiter) def test_first_row_length(all_parsers): stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6,\n6,7,8") From 18f3767315ba3958784dad214dfa95328a45db27 Mon Sep 17 00:00:00 2001 From: Magdalena Proszewska Date: Thu, 8 Oct 2020 21:27:02 +0200 Subject: [PATCH 32/32] Run black --- pandas/tests/io/parser/test_common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 87048943733cd..f199267d0d462 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2182,7 +2182,7 @@ def test_no_header_two_extra_columns(all_parsers): df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) - + def test_read_csv_names_not_accepting_sets(all_parsers): # GH 34946 data = """\ @@ -2243,8 +2243,9 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): with pytest.raises(ValueError, match=msg): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) + def test_first_row_length(all_parsers): stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6,\n6,7,8") parser = all_parsers with tm.assert_produces_warning(ParserWarning): - parser.read_csv(stream, index_col=False) \ No newline at end of file + parser.read_csv(stream, index_col=False)