From c94b45e1edd4494eee2a8885c25e041f6100eba6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 15 May 2020 17:38:04 +0200 Subject: [PATCH 01/14] PERF: Remove unnecessary copies in sorting functions --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 25312b180dba1..da9cbe1023599 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -385,7 +385,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): from pandas.core.indexes.api import Index if not key: - return values.copy() + return values if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) From 0ab450b9ea5f38582d09acbcd8f697ac62f37919 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 19:06:23 +0200 Subject: [PATCH 02/14] Run tests --- pandas/core/sorting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da9cbe1023599..2943714a5d015 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -386,7 +386,6 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): if not key: return values - if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) @@ -404,7 +403,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): else: type_of_values = type(values) result = type_of_values(result) # try to revert to original type otherwise - except TypeError: + except TypeError:opy() raise TypeError( f"User-provided `key` function returned an invalid type {type(result)} \ which could not be converted to {type(values)}." From 54c7304d585c60dd148e3e47aa28514100289eb5 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 19:07:12 +0200 Subject: [PATCH 03/14] Run tests --- pandas/core/sorting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2943714a5d015..da9cbe1023599 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -386,6 +386,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): if not key: return values + if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) @@ -403,7 +404,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): else: type_of_values = type(values) result = type_of_values(result) # try to revert to original type otherwise - except TypeError:opy() + except TypeError: raise TypeError( f"User-provided `key` function returned an invalid type {type(result)} \ which could not be converted to {type(values)}." From 6d72a346770fc93778a83e171daceec52b60e6d4 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 23:19:58 +0200 Subject: [PATCH 04/14] Add asv --- asv_bench/benchmarks/algorithms.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 65e52e03c43c7..a96d9bc924308 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -174,4 +174,13 @@ def time_argsort(self, N): self.array.argsort() +class SortIndexSeries: + def setup(self): + N = 10 ** 5 + idx = pd.date_range(start="1/1/2000", periods=N, freq="s") + self.s = pd.Series(np.random.randn(N), index=idx) + + def time_sort_index(self): + self.s.sort_index() + from .pandas_vb_common import setup # noqa: F401 isort:skip From 5ba54a6039d3981a4187b38e11b479e53f8dcdd1 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 23:20:53 +0200 Subject: [PATCH 05/14] Run black --- asv_bench/benchmarks/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index a96d9bc924308..7afa97f9aa394 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -183,4 +183,5 @@ def setup(self): def time_sort_index(self): self.s.sort_index() + from .pandas_vb_common import setup # noqa: F401 isort:skip From 276627019d8000792473742c0a9036cf59b5f3cb Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 00:28:24 +0200 Subject: [PATCH 06/14] Remove asv --- asv_bench/benchmarks/algorithms.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 7afa97f9aa394..65e52e03c43c7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -174,14 +174,4 @@ def time_argsort(self, N): self.array.argsort() -class SortIndexSeries: - def setup(self): - N = 10 ** 5 - idx = pd.date_range(start="1/1/2000", periods=N, freq="s") - self.s = pd.Series(np.random.randn(N), index=idx) - - def time_sort_index(self): - self.s.sort_index() - - from .pandas_vb_common import setup # noqa: F401 isort:skip From b800207c3f4369eb43526d611fbd8303a774ea48 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 03:42:34 +0200 Subject: [PATCH 07/14] BUG: Fix using dtype with parse_dates in read_csv --- pandas/io/parsers.py | 7 ++++++- pandas/tests/io/parser/test_common.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index aca2f9f5ac5bb..be8c3a44a57bf 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1708,7 +1708,9 @@ def _convert_to_ndarrays( result = {} for c, values in dct.items(): conv_f = None if converters is None else converters.get(c, None) - if isinstance(dtypes, dict): + if values.dtype != object: + cast_type = values.dtype + elif isinstance(dtypes, dict): cast_type = dtypes.get(c, None) else: # single dtype or None @@ -3264,6 +3266,9 @@ def _make_date_converter( ): def converter(*date_cols): if date_parser is None: + date_cols = tuple( + [x if isinstance(x, np.ndarray) else x.to_numpy() for x in date_cols] + ) strs = parsing.concat_date_cols(date_cols) try: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 55256499c6bb2..b6c143c298b25 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2135,3 +2135,17 @@ def test_no_header_two_extra_columns(all_parsers): parser = all_parsers df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) + + +def test_dtype_with_parse_dates(all_parsers): + # GH 34066 + parser = all_parsers + data = """ +a,b +1,2020-05-23 01:00:00""" + expected = DataFrame( + [["1", "2020-05-23 01:00:00"]], columns=["a", "b"], dtype="string" + ) + expected = expected.astype({"b": np.datetime64}) + df = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) + tm.assert_frame_equal(df, expected) From 6b8f5627900f35dff4566b4cdee2d85d658b0d78 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 23:23:08 +0200 Subject: [PATCH 08/14] Fix lint --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index be8c3a44a57bf..3cdd733581bcd 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3267,7 +3267,7 @@ def _make_date_converter( def converter(*date_cols): if date_parser is None: date_cols = tuple( - [x if isinstance(x, np.ndarray) else x.to_numpy() for x in date_cols] + x if isinstance(x, np.ndarray) else x.to_numpy() for x in date_cols ) strs = parsing.concat_date_cols(date_cols) From 3c13f59037412487364d625d66a775c4def07203 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 28 May 2020 19:04:36 +0200 Subject: [PATCH 09/14] Modify test --- pandas/tests/io/parser/test_common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b6c143c298b25..e8fe3ce58ba66 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -17,7 +17,7 @@ from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, to_datetime import pandas._testing as tm from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser @@ -2146,6 +2146,6 @@ def test_dtype_with_parse_dates(all_parsers): expected = DataFrame( [["1", "2020-05-23 01:00:00"]], columns=["a", "b"], dtype="string" ) - expected = expected.astype({"b": np.datetime64}) - df = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) - tm.assert_frame_equal(df, expected) + expected['b'] = to_datetime(expected['b']) + result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) + tm.assert_frame_equal(result, expected) From d9aa31967e1286dc844773abb2be094a0b43a4ac Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 5 Jun 2020 03:51:05 +0200 Subject: [PATCH 10/14] Add asv --- asv_bench/benchmarks/io/excel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..926286ee5fab2 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 2000 + N = 20000 C = 5 df = DataFrame( np.random.randn(N, C), @@ -69,5 +69,9 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) + def nrows_read_excel(self, engine): + name = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine, nrows=1) + from ..pandas_vb_common import setup # noqa: F401 isort:skip From 0afb1b14c359eece44f3885d5f20b40e07a9ccb6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 5 Jun 2020 12:31:33 +0200 Subject: [PATCH 11/14] Fix --- asv_bench/benchmarks/io/excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 926286ee5fab2..e9776ff2c641e 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -70,7 +70,7 @@ def time_read_excel(self, engine): read_excel(fname, engine=engine) def nrows_read_excel(self, engine): - name = self.fname_odf if engine == "odf" else self.fname_excel + fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine, nrows=1) From 85dd0d6638ae91d7d4e1656e602a86971ca1fdbe Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 8 Jun 2020 17:16:19 +0200 Subject: [PATCH 12/14] Run black --- pandas/tests/io/parser/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e8fe3ce58ba66..26b99c3a86d34 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2146,6 +2146,6 @@ def test_dtype_with_parse_dates(all_parsers): expected = DataFrame( [["1", "2020-05-23 01:00:00"]], columns=["a", "b"], dtype="string" ) - expected['b'] = to_datetime(expected['b']) + expected["b"] = to_datetime(expected["b"]) result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) tm.assert_frame_equal(result, expected) From 4175b80077322d186eb634a0db8d840782c69c17 Mon Sep 17 00:00:00 2001 From: Magdalena Proszewska Date: Thu, 8 Oct 2020 22:29:02 +0200 Subject: [PATCH 13/14] Run black --- pandas/tests/io/parser/test_common.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index a9ae088e2eec9..02e650d8695d1 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -18,7 +18,16 @@ from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context, to_datetime +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + compat, + concat, + option_context, + to_datetime, +) import pandas._testing as tm @@ -2196,7 +2205,7 @@ def test_dtype_with_parse_dates(all_parsers): result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) tm.assert_frame_equal(result, expected) - + def test_read_csv_names_not_accepting_sets(all_parsers): # GH 34946 data = """\ @@ -2255,4 +2264,4 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, sep=delimiter) with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=delimiter) \ No newline at end of file + parser.read_table(f, delim_whitespace=True, delimiter=delimiter) From 6a782206a29f422266f1216f7496febfec3c5d93 Mon Sep 17 00:00:00 2001 From: Magdalena Proszewska Date: Thu, 8 Oct 2020 22:36:22 +0200 Subject: [PATCH 14/14] Run isort --- pandas/tests/io/parser/test_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 02e650d8695d1..e48f205527677 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -28,7 +28,6 @@ option_context, to_datetime, ) - import pandas._testing as tm from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser