From c38e1ce3ec8a4c2da9f7abe7b9c8fab4cdc9cb50 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Wed, 25 Dec 2019 12:42:24 +0200 Subject: [PATCH 1/7] CI: Added unwanted patterns check --- ci/code_checks.sh | 4 ++ scripts/validate_string_concatenation.py | 73 ++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100755 scripts/validate_string_concatenation.py diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 94eaab0a5b4da..462275d9b5bec 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -207,6 +207,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include=*.{py,pyx} 'xrange' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of not concatenated strings' ; echo $MSG + python ./scripts/validate_string_concatenation.py pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG INVGREP_APPEND=" <- trailing whitespaces found" invgrep -RI --exclude=\*.{svg,c,cpp,html,js} --exclude-dir=env "\s$" * diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py new file mode 100755 index 0000000000000..a262bbe583380 --- /dev/null +++ b/scripts/validate_string_concatenation.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +""" +Check where there is a string that needs to be concatenated. +""" + +import os +import sys +import token +import tokenize + +FILE_EXTENTIONS_TO_CHECK = [".py", ".pyx"] + + +def main(): + path = sys.argv[1] + + if not os.path.exists(path): + raise ValueError("Please enter a valid path, to a file/directory.") + + if os.path.isfile(path): + # Means that the given path is of a single file. + sys.exit(is_concatenated(path)) + + status_codes = set() + # Means that the given path is of a directory. + for subdir, _, files in os.walk(path): + for file_name in files: + ext = os.path.splitext(os.path.join(subdir, file_name))[1] + if ext in FILE_EXTENTIONS_TO_CHECK: + status_codes.add(is_concatenated(os.path.join(subdir, file_name))) + + if 1 in status_codes: + sys.exit(1) + + sys.exit(0) + + +def is_concatenated(file_path): + """ + Checking if the file containing strings that needs to be concatenated. + + Parameters + ---------- + file_path : str + File path pointing to a single file. + + Returns + ------- + int + Status code representing if the file needs a fix. + 0 - All good. + 1 - Needs to be fixed. + """ + with open(file_path, "r") as file_name: + toks = list(tokenize.generate_tokens(file_name.readline)) + for i in range(len(toks) - 1): + tok = toks[i] + tok2 = toks[i + 1] + if tok[0] == token.STRING and tok[0] == tok2[0]: + print( + "{file_path}:{line_number}:\t{start} and {end}".format( + file_path=file_path, + line_number=tok[2][0], + start=tok[1], + end=tok2[1], + ) + ) + return 1 + return 0 + + +if __name__ == "__main__": + main() From 207b20d3263974d0e29efb5e057eecdca72abcdb Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Wed, 25 Dec 2019 14:38:13 +0200 Subject: [PATCH 2/7] Removed unconcatenated strings --- pandas/core/arrays/datetimelike.py | 4 +--- pandas/core/dtypes/cast.py | 8 ++------ pandas/io/json/_json.py | 2 +- pandas/io/json/_table_schema.py | 8 +++----- pandas/tests/arithmetic/test_period.py | 6 ++---- pandas/tests/arrays/test_timedeltas.py | 5 ++--- pandas/tests/base/test_ops.py | 4 +--- pandas/tests/frame/test_constructors.py | 4 ++-- pandas/tests/frame/test_missing.py | 2 +- pandas/tests/indexes/datetimes/test_tools.py | 2 +- pandas/tests/indexing/test_loc.py | 4 +--- pandas/tests/io/formats/test_to_csv.py | 10 +++------- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/io/json/test_readlines.py | 2 +- pandas/tests/io/json/test_ujson.py | 6 +++--- pandas/tests/io/parser/test_common.py | 5 ++--- pandas/tests/io/parser/test_textreader.py | 2 +- pandas/tests/io/pytables/test_store.py | 2 +- pandas/tests/io/test_common.py | 8 ++------ pandas/tests/reshape/merge/test_join.py | 12 +++--------- pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/series/test_constructors.py | 4 ++-- pandas/tests/series/test_dtypes.py | 2 +- pandas/tests/series/test_missing.py | 8 ++++---- pandas/tests/tslibs/test_parse_iso8601.py | 4 +--- pandas/tests/util/test_validate_args_and_kwargs.py | 4 +--- pandas/tests/util/test_validate_kwargs.py | 4 +--- pandas/util/_test_decorators.py | 2 +- pandas/util/_validators.py | 6 ++---- 29 files changed, 48 insertions(+), 86 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 045e511e32586..c37bd01d5fe30 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -916,9 +916,7 @@ def _is_unique(self): def _add_datetimelike_scalar(self, other): # Overriden by TimedeltaArray - raise TypeError( - f"cannot add {type(self).__name__} and " f"{type(other).__name__}" - ) + raise TypeError(f"cannot add {type(self).__name__} and {type(other).__name__}") _add_datetime_arraylike = _add_datetimelike_scalar diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1ab21f18f3bdc..946070f8fad98 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -820,9 +820,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): if dtype.kind == "M": return arr.astype(dtype) - raise TypeError( - f"cannot astype a datetimelike from [{arr.dtype}] " f"to [{dtype}]" - ) + raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") elif is_timedelta64_dtype(arr): if is_object_dtype(dtype): @@ -842,9 +840,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): elif dtype == _TD_DTYPE: return arr.astype(_TD_DTYPE, copy=copy) - raise TypeError( - f"cannot astype a timedelta from [{arr.dtype}] " f"to [{dtype}]" - ) + raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer): diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index c22089b4e1eae..4b0c0389f1439 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -53,7 +53,7 @@ def to_json( if not index and orient not in ["split", "table"]: raise ValueError( - "'index=False' is only valid when 'orient' is " "'split' or 'table'" + "'index=False' is only valid when 'orient' is 'split' or 'table'" ) path_or_buf = stringify_path(path_or_buf) diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index bc5a9783391a4..87bfd6030ec31 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -81,9 +81,7 @@ def set_default_names(data): if len(nms) == 1 and data.index.name == "index": warnings.warn("Index name of 'index' is not round-trippable") elif len(nms) > 1 and any(x.startswith("level_") for x in nms): - warnings.warn( - "Index names beginning with 'level_' are not " "round-trippable" - ) + warnings.warn("Index names beginning with 'level_' are not round-trippable") return data data = data.copy() @@ -317,12 +315,12 @@ def parse_table_schema(json, precise_float): # Cannot directly use as_type with timezone data on object; raise for now if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()): - raise NotImplementedError('table="orient" can not yet read timezone ' "data") + raise NotImplementedError('table="orient" can not yet read timezone data') # No ISO constructor for Timedelta as of yet, so need to raise if "timedelta64" in dtypes.values(): raise NotImplementedError( - 'table="orient" can not yet read ' "ISO-formatted Timedelta data" + 'table="orient" can not yet read ISO-formatted Timedelta data' ) df = df.astype(dtypes) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index ed693d873efb8..5917c8deee8a9 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -168,9 +168,7 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): # TODO: Could parametrize over boxes for idx? idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") - rev_msg = ( - r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=A-DEC\)" - ) + rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=A-DEC\)" idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx @@ -184,7 +182,7 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): Period("2011", freq="4M") >= base idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="4M") - rev_msg = r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=4M\)" + rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=4M\)" idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 42e7bee97e671..bb6ef09bad17e 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -41,13 +41,12 @@ def test_other_type_raises(self): def test_incorrect_dtype_raises(self): # TODO: why TypeError for 'category' but ValueError for i8? with pytest.raises( - ValueError, match=r"category cannot be converted " r"to timedelta64\[ns\]" + ValueError, match=r"category cannot be converted to timedelta64\[ns\]" ): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category") with pytest.raises( - ValueError, - match=r"dtype int64 cannot be converted " r"to timedelta64\[ns\]", + ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]", ): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64")) diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py index 04277ce929bca..4231aa844f282 100644 --- a/pandas/tests/base/test_ops.py +++ b/pandas/tests/base/test_ops.py @@ -698,9 +698,7 @@ def test_duplicated_drop_duplicates_index(self): with pytest.raises( TypeError, - match=( - r"drop_duplicates\(\) got an " r"unexpected keyword argument" - ), + match=r"drop_duplicates\(\) got an unexpected keyword argument", ): idx.drop_duplicates(inplace=True) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3e5027ee54cb3..f3cc11cb7027d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -479,11 +479,11 @@ def test_constructor_error_msgs(self): DataFrame(np.zeros((3, 3, 3)), columns=["A", "B", "C"], index=[1]) # wrong size axis labels - msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(1, 3\)" + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=["A", "B", "C"], index=[1]) - msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(2, 2\)" + msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)" with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=["A", "B"], index=[1, 2]) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index ea7e9b4ac490d..f9a2061aa1ff4 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -662,7 +662,7 @@ def test_fillna_invalid_method(self, float_frame): def test_fillna_invalid_value(self, float_frame): # list - msg = '"value" parameter must be a scalar or dict, but you passed' ' a "{}"' + msg = '"value" parameter must be a scalar or dict, but you passed a "{}"' with pytest.raises(TypeError, match=msg.format("list")): float_frame.fillna([1, 2]) # tuple diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index f1c23d7b245c6..1aaacfc0949c3 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1298,7 +1298,7 @@ def test_dataframe(self, cache): tm.assert_series_equal(result, expected) # extra columns - msg = "extra keys have been passed to the datetime assemblage: " r"\[foo\]" + msg = r"extra keys have been passed to the datetime assemblage: \[foo\]" with pytest.raises(ValueError, match=msg): df2 = df.copy() df2["foo"] = 1 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 6f20ec649b200..8b3620e8cd843 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -242,9 +242,7 @@ def test_loc_to_fail(self): with pytest.raises(KeyError, match=msg): s.loc[[-1, -2]] - msg = ( - r"\"None of \[Index\(\['4'\], dtype='object'\)\] are" r" in the \[index\]\"" - ) + msg = r"\"None of \[Index\(\['4'\], dtype='object'\)\] are in the \[index\]\"" with pytest.raises(KeyError, match=msg): s.loc[["4"]] diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 80edbd828194d..24233a0ec84b1 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -376,16 +376,14 @@ def test_to_csv_string_with_lf(self): assert f.read() == expected_noarg with tm.ensure_clean("lf_test.csv") as path: # case 2: LF as line terminator - expected_lf = b"int,str_lf\n" b"1,abc\n" b'2,"d\nef"\n' b'3,"g\nh\n\ni"\n' + expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' df.to_csv(path, line_terminator="\n", index=False) with open(path, "rb") as f: assert f.read() == expected_lf with tm.ensure_clean("lf_test.csv") as path: # case 3: CRLF as line terminator # 'line_terminator' should not change inner element - expected_crlf = ( - b"int,str_lf\r\n" b"1,abc\r\n" b'2,"d\nef"\r\n' b'3,"g\nh\n\ni"\r\n' - ) + expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' df.to_csv(path, line_terminator="\r\n", index=False) with open(path, "rb") as f: assert f.read() == expected_crlf @@ -412,9 +410,7 @@ def test_to_csv_string_with_crlf(self): assert f.read() == expected_noarg with tm.ensure_clean("crlf_test.csv") as path: # case 2: LF as line terminator - expected_lf = ( - b"int,str_crlf\n" b"1,abc\n" b'2,"d\r\nef"\n' b'3,"g\r\nh\r\n\r\ni"\n' - ) + expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' df.to_csv(path, line_terminator="\n", index=False) with open(path, "rb") as f: assert f.read() == expected_lf diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6489fedad03e3..6e27b79458faf 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1244,7 +1244,7 @@ def test_to_jsonl(self): # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) - expected = '{"a\\\\":"foo\\\\","b":"bar"}\n' '{"a\\\\":"foo\\"","b":"bar"}' + expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}' assert result == expected tm.assert_frame_equal(pd.read_json(result, lines=True), df) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index b85032904c5ec..90da175855c34 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -56,7 +56,7 @@ def test_to_jsonl(): # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) - expected = '{"a\\\\":"foo\\\\","b":"bar"}\n' '{"a\\\\":"foo\\"","b":"bar"}' + expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}' assert result == expected tm.assert_frame_equal(read_json(result, lines=True), df) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 6008f6b651c2a..dab2882499634 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -111,9 +111,9 @@ def test_encode_decimal(self): @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): string_input = "A string \\ / \b \f \n \r \t &" - not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n ' '\\r \\t <\\/script> &"' + not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n \\r \\t <\\/script> &"' html_encoded = ( - '"A string \\\\ \\/ \\b \\f \\n \\r \\t ' '\\u003c\\/script\\u003e \\u0026"' + '"A string \\\\ \\/ \\b \\f \\n \\r \\t \\u003c\\/script\\u003e \\u0026"' ) def helper(expected_output, **encode_kwargs): @@ -816,7 +816,7 @@ def test_array_numpy_labelled(self): # see gh-10837: write out the dump explicitly # so there is no dependency on iteration order - input_dumps = '[{"a": 42, "b":31}, {"a": 24, "c": 99}, ' '{"a": 2.4, "b": 78}]' + input_dumps = '[{"a": 42, "b":31}, {"a": 24, "c": 99}, {"a": 2.4, "b": 78}]' output = ujson.loads(input_dumps, numpy=True, labelled=True) expected_vals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) assert (expected_vals == output[0]).all() diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index fe360f1346c7c..42a4a55988b0f 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1144,9 +1144,8 @@ def test_escapechar(all_parsers): StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" ) - assert result["SEARCH_TERM"][2] == ( - 'SLAGBORD, "Bergslagen", ' "IKEA:s 1700-tals serie" - ) + assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie' + tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 75a5b7cd53ddb..e34f1010d690e 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -179,7 +179,7 @@ def test_header_not_enough_lines(self): assert_array_dicts_equal(recs, expected) def test_escapechar(self): - data = '\\"hello world"\n' '\\"hello world"\n' '\\"hello world"' + data = '\\"hello world"\n\\"hello world"\n\\"hello world"' reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\") result = reader.read() diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 3cd9d9cdd67d2..18d265438dee2 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -3214,7 +3214,7 @@ def test_frame_select_complex(self, setup_path): tm.assert_frame_equal(result, expected) result = store.select( - "df", "(index>df.index[3] & " 'index<=df.index[6]) | string="bar"' + "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"' ) expected = df.loc[ ((df.index > df.index[3]) & (df.index <= df.index[6])) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index f4efbbeda6311..cfcd2c9f2df95 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -142,9 +142,7 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) - msg2 = ( - r"\[Errno 2\] No such file or directory: '.+does_not_exist" r"\.{}'" - ).format(fn_ext) + msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( @@ -180,9 +178,7 @@ def test_read_expands_user_home_dir( monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) - msg2 = ( - r"\[Errno 2\] No such file or directory:" r" '.+does_not_exist\.{}'" - ).format(fn_ext) + msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Unexpected character found when decoding 'false'" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index e477b7608ab93..94a21c06162a6 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -226,9 +226,7 @@ def test_join_on_fails_with_different_right_index(self): {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}, index=tm.makeCustomIndex(10, 2), ) - msg = ( - r"len\(left_on\) must equal the number of levels in the index" ' of "right"' - ) + msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): merge(df, df2, left_on="a", right_index=True) @@ -240,9 +238,7 @@ def test_join_on_fails_with_different_left_index(self): df2 = DataFrame( {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)} ) - msg = ( - r"len\(right_on\) must equal the number of levels in the index" ' of "left"' - ) + msg = r'len\(right_on\) must equal the number of levels in the index of "left"' with pytest.raises(ValueError, match=msg): merge(df, df2, right_on="b", left_index=True) @@ -737,9 +733,7 @@ def test_join_multi_to_multi(self, join_type): ) tm.assert_frame_equal(expected, result) - msg = ( - r"len\(left_on\) must equal the number of levels in the index" ' of "right"' - ) + msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): left.join(right, on="xy", how=join_type) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5f4e8323c7127..e191bf67c51ca 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -744,7 +744,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo', 'foo'\]," r" dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index c772038619db0..fffb9c577bf3d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -773,7 +773,7 @@ def test_constructor_dtype_datetime64(self): dts.astype("int64") # invalid casting - msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]" + msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to \[int32\]" with pytest.raises(TypeError, match=msg): dts.astype("int32") @@ -1198,7 +1198,7 @@ def test_constructor_dtype_timedelta64(self): td.astype("int64") # invalid casting - msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" r" \[int32\]" + msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to \[int32\]" with pytest.raises(TypeError, match=msg): td.astype("int32") diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index ff4842791b4fd..69e34a4d97006 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -273,7 +273,7 @@ def test_astype_categorical_to_other(self): expected = s tm.assert_series_equal(s.astype("category"), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) - msg = r"could not convert string to float|" r"invalid literal for float\(\)" + msg = r"could not convert string to float|invalid literal for float\(\)" with pytest.raises(ValueError, match=msg): s.astype("float64") diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 45159cc28c5b7..196749a965885 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -502,11 +502,11 @@ def test_fillna_int(self): def test_fillna_raise(self): s = Series(np.random.randint(-100, 100, 50)) - msg = '"value" parameter must be a scalar or dict, but you passed a' ' "list"' + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): s.fillna([1, 2]) - msg = '"value" parameter must be a scalar or dict, but you passed a' ' "tuple"' + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): s.fillna((1, 2)) @@ -593,11 +593,11 @@ def test_fillna_categorical_raise(self): with pytest.raises(ValueError, match="fill value must be in categories"): s.fillna({1: "d", 3: "a"}) - msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "list"' + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): s.fillna(["a", "b"]) - msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "tuple"' + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): s.fillna(("a", "b")) diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index a6e7aee46b485..a58f227c20c7f 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -59,9 +59,7 @@ def test_parsers_iso8601_invalid(date_str): def test_parsers_iso8601_invalid_offset_invalid(): date_str = "2001-01-01 12-34-56" - msg = "Timezone hours offset out of range " 'in datetime string "{s}"'.format( - s=date_str - ) + msg = f'Timezone hours offset out of range in datetime string "{date_str}"' with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) diff --git a/pandas/tests/util/test_validate_args_and_kwargs.py b/pandas/tests/util/test_validate_args_and_kwargs.py index 396056466bb81..6aa2088c07b5d 100644 --- a/pandas/tests/util/test_validate_args_and_kwargs.py +++ b/pandas/tests/util/test_validate_args_and_kwargs.py @@ -76,9 +76,7 @@ def test_duplicate_argument(): kwargs = {"foo": None, "bar": None} args = (None,) # duplicate value for "foo" - msg = r"{fname}\(\) got multiple values for keyword " r"argument '{arg}'".format( - fname=_fname, arg="foo" - ) + msg = fr"{_fname}\(\) got multiple values for keyword argument 'foo'" with pytest.raises(TypeError, match=msg): validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index b6241def4e5d6..54b5c6ed034a2 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -16,9 +16,7 @@ def test_bad_kwarg(): compat_args[bad_arg + "o"] = "bar" kwargs = {good_arg: "foo", bad_arg: "bar"} - msg = r"{fname}\(\) got an unexpected " r"keyword argument '{arg}'".format( - fname=_fname, arg=bad_arg - ) + msg = fr"{_fname}\(\) got an unexpected keyword argument '{bad_arg}'" with pytest.raises(TypeError, match=msg): validate_kwargs(_fname, kwargs, compat_args) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 7e14ed27d5bd4..a280da6e239b2 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -191,7 +191,7 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: ) skip_if_no_ne = pytest.mark.skipif( not _USE_NUMEXPR, - reason=f"numexpr enabled->{_USE_NUMEXPR}, " f"installed->{_NUMEXPR_INSTALLED}", + reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{_NUMEXPR_INSTALLED}", ) diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 547fe748ae941..6cc14c7804b4a 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -120,9 +120,7 @@ def _check_for_invalid_keys(fname, kwargs, compat_args): if diff: bad_arg = list(diff)[0] - raise TypeError( - (f"{fname}() got an unexpected " f"keyword argument '{bad_arg}'") - ) + raise TypeError(f"{fname}() got an unexpected keyword argument '{bad_arg}'") def validate_kwargs(fname, kwargs, compat_args): @@ -202,7 +200,7 @@ def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_ar for key in args_dict: if key in kwargs: raise TypeError( - f"{fname}() got multiple values for keyword " f"argument '{key}'" + f"{fname}() got multiple values for keyword argument '{key}'" ) kwargs.update(args_dict) From d920bfd1f146e80d1ca8c957db57659cce920e65 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Wed, 25 Dec 2019 15:34:35 +0200 Subject: [PATCH 3/7] Make the script to not stop on the first occurrence at each file. --- scripts/validate_string_concatenation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index a262bbe583380..29ae18234ebe9 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -51,12 +51,14 @@ def is_concatenated(file_path): 0 - All good. 1 - Needs to be fixed. """ + need_fix = False with open(file_path, "r") as file_name: toks = list(tokenize.generate_tokens(file_name.readline)) for i in range(len(toks) - 1): tok = toks[i] tok2 = toks[i + 1] if tok[0] == token.STRING and tok[0] == tok2[0]: + need_fix = True print( "{file_path}:{line_number}:\t{start} and {end}".format( file_path=file_path, @@ -65,8 +67,8 @@ def is_concatenated(file_path): end=tok2[1], ) ) - return 1 - return 0 + + return int(need_fix) if __name__ == "__main__": From e88f604ad049c61a6c1c5980cc2d1d0f8a4eebc9 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Wed, 25 Dec 2019 16:26:44 +0200 Subject: [PATCH 4/7] Added support for pxi.ini files --- scripts/validate_string_concatenation.py | 42 ++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index 29ae18234ebe9..904ec9ea03fbd 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -8,7 +8,7 @@ import token import tokenize -FILE_EXTENTIONS_TO_CHECK = [".py", ".pyx"] +FILE_EXTENTIONS_TO_CHECK = [".pxd", ".py", ".pyx", ".pyx.ini"] def main(): @@ -25,7 +25,7 @@ def main(): # Means that the given path is of a directory. for subdir, _, files in os.walk(path): for file_name in files: - ext = os.path.splitext(os.path.join(subdir, file_name))[1] + ext = full_ext(os.path.join(subdir, file_name)) if ext in FILE_EXTENTIONS_TO_CHECK: status_codes.add(is_concatenated(os.path.join(subdir, file_name))) @@ -35,6 +35,44 @@ def main(): sys.exit(0) +def full_ext(path): + """ + Get the full file extention name. + + Parameters + ---------- + path : str + File path. + + Returns + ------- + str + Full extention of a file. + + Notes + ----- + This function is needed only because of file extentions like + ` .pxi.ini` for example. + + Examples + ------- + + With one suffix: + + >>> ext = full_ext('/full/path/to/file.py') + >>> ext + .py + + Wuth two suffixes: + + >>> ext = full_ext('/full/path/to/file.pxi.ini') + >>> ext + .pxi.ini + """ + ext_list = [".{suffix}".format(suffix=suffix) for suffix in path.split(".")[1:]] + return "".join(ext_list) + + def is_concatenated(file_path): """ Checking if the file containing strings that needs to be concatenated. From 545ba18fc6355a4ca62df7efb0f5f989ef2e4784 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Wed, 25 Dec 2019 16:30:32 +0200 Subject: [PATCH 5/7] alimcmaster1 review fixes --- ci/code_checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 462275d9b5bec..2b467a03966cd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -208,7 +208,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for use of not concatenated strings' ; echo $MSG - python ./scripts/validate_string_concatenation.py pandas + python $BASE_DIR/scripts/validate_string_concatenation.py pandas RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG From edb2da1d348705149f0105169dd56aa6459698fd Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Wed, 25 Dec 2019 19:35:11 +0200 Subject: [PATCH 6/7] Fixes for jbrockmendel's review --- scripts/validate_string_concatenation.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index 904ec9ea03fbd..a274cbb63dd04 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -1,6 +1,24 @@ #!/usr/bin/env python """ +GH #30454 + Check where there is a string that needs to be concatenated. + +This is necessary after black formating, +where for example black transforms this: + +>>> foo = ( +... "bar " +... "baz" +... ) + +into this: + +>>> foo = ("bar " "baz") + +Black is not considering this as an +issue (see https://github.com/psf/black/issues/1051), so we are checking +it here. """ import os @@ -8,7 +26,7 @@ import token import tokenize -FILE_EXTENTIONS_TO_CHECK = [".pxd", ".py", ".pyx", ".pyx.ini"] +FILE_EXTENSIONS_TO_CHECK = [".pxd", ".py", ".pyx", ".pyx.ini"] def main(): @@ -26,7 +44,7 @@ def main(): for subdir, _, files in os.walk(path): for file_name in files: ext = full_ext(os.path.join(subdir, file_name)) - if ext in FILE_EXTENTIONS_TO_CHECK: + if ext in FILE_EXTENSIONS_TO_CHECK: status_codes.add(is_concatenated(os.path.join(subdir, file_name))) if 1 in status_codes: From cd99f2fb903cdced48bad2eab1ed93589c7cf4dd Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <> Date: Wed, 25 Dec 2019 21:20:33 +0200 Subject: [PATCH 7/7] Seperating the PR --- ci/code_checks.sh | 4 - scripts/validate_string_concatenation.py | 131 ----------------------- 2 files changed, 135 deletions(-) delete mode 100755 scripts/validate_string_concatenation.py diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2b467a03966cd..94eaab0a5b4da 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -207,10 +207,6 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include=*.{py,pyx} 'xrange' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for use of not concatenated strings' ; echo $MSG - python $BASE_DIR/scripts/validate_string_concatenation.py pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG INVGREP_APPEND=" <- trailing whitespaces found" invgrep -RI --exclude=\*.{svg,c,cpp,html,js} --exclude-dir=env "\s$" * diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py deleted file mode 100755 index a274cbb63dd04..0000000000000 --- a/scripts/validate_string_concatenation.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python -""" -GH #30454 - -Check where there is a string that needs to be concatenated. - -This is necessary after black formating, -where for example black transforms this: - ->>> foo = ( -... "bar " -... "baz" -... ) - -into this: - ->>> foo = ("bar " "baz") - -Black is not considering this as an -issue (see https://github.com/psf/black/issues/1051), so we are checking -it here. -""" - -import os -import sys -import token -import tokenize - -FILE_EXTENSIONS_TO_CHECK = [".pxd", ".py", ".pyx", ".pyx.ini"] - - -def main(): - path = sys.argv[1] - - if not os.path.exists(path): - raise ValueError("Please enter a valid path, to a file/directory.") - - if os.path.isfile(path): - # Means that the given path is of a single file. - sys.exit(is_concatenated(path)) - - status_codes = set() - # Means that the given path is of a directory. - for subdir, _, files in os.walk(path): - for file_name in files: - ext = full_ext(os.path.join(subdir, file_name)) - if ext in FILE_EXTENSIONS_TO_CHECK: - status_codes.add(is_concatenated(os.path.join(subdir, file_name))) - - if 1 in status_codes: - sys.exit(1) - - sys.exit(0) - - -def full_ext(path): - """ - Get the full file extention name. - - Parameters - ---------- - path : str - File path. - - Returns - ------- - str - Full extention of a file. - - Notes - ----- - This function is needed only because of file extentions like - ` .pxi.ini` for example. - - Examples - ------- - - With one suffix: - - >>> ext = full_ext('/full/path/to/file.py') - >>> ext - .py - - Wuth two suffixes: - - >>> ext = full_ext('/full/path/to/file.pxi.ini') - >>> ext - .pxi.ini - """ - ext_list = [".{suffix}".format(suffix=suffix) for suffix in path.split(".")[1:]] - return "".join(ext_list) - - -def is_concatenated(file_path): - """ - Checking if the file containing strings that needs to be concatenated. - - Parameters - ---------- - file_path : str - File path pointing to a single file. - - Returns - ------- - int - Status code representing if the file needs a fix. - 0 - All good. - 1 - Needs to be fixed. - """ - need_fix = False - with open(file_path, "r") as file_name: - toks = list(tokenize.generate_tokens(file_name.readline)) - for i in range(len(toks) - 1): - tok = toks[i] - tok2 = toks[i + 1] - if tok[0] == token.STRING and tok[0] == tok2[0]: - need_fix = True - print( - "{file_path}:{line_number}:\t{start} and {end}".format( - file_path=file_path, - line_number=tok[2][0], - start=tok[1], - end=tok2[1], - ) - ) - - return int(need_fix) - - -if __name__ == "__main__": - main()