23
23
BytesIO ,
24
24
StringIO ,
25
25
)
26
+ from itertools import combinations
26
27
import operator
27
28
import pickle
28
29
import re
@@ -1933,13 +1934,18 @@ def test_str_fullmatch(pat, case, na, exp):
1933
1934
1934
1935
1935
1936
@pytest.mark.parametrize(
    "sub, start, end, exp, exp_type",
    [
        ["ab", 0, None, [0, None], pa.int32()],
        ["bc", 1, 3, [1, None], pa.int64()],
        ["ab", 1, 3, [-1, None], pa.int64()],
        ["ab", -3, -3, [-1, None], pa.int64()],
    ],
)
def test_str_find(sub, start, end, exp, exp_type):
    # Each case gives the substring, the start/end bounds, the expected
    # values, and the Arrow integer type the expected Series is built with.
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    expected = pd.Series(exp, dtype=ArrowDtype(exp_type))
    result = strings.str.find(sub, start=start, end=end)
    tm.assert_series_equal(result, expected)
1944
1950
1945
1951
@@ -1951,10 +1957,70 @@ def test_str_find_negative_start():
1951
1957
tm .assert_series_equal (result , expected )
1952
1958
1953
1959
1954
def test_str_find_no_end():
    # Only ``start`` is passed; ``end`` is not supplied.
    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    if not pa_version_under13p0:
        expected = pd.Series([-1, None], dtype="int64[pyarrow]")
        result = ser.str.find("ab", start=1)
        tm.assert_series_equal(result, expected)
        return
    # https://github.com/apache/arrow/issues/36311
    with pytest.raises(pa.lib.ArrowInvalid, match="Negative buffer resize"):
        ser.str.find("ab", start=1)
1970
+
1971
+
1972
def test_str_find_negative_start_negative_end():
    # GH 56791
    # Both bounds are negative; "d" is expected at index 3 of "abcdefg".
    values = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
    expected = pd.Series([3, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(values.str.find(sub="d", start=-6, end=-3), expected)
1978
+
1979
+
1980
def test_str_find_large_start():
    # GH 56791
    # ``start`` (16) is larger than the length of the only non-null string.
    values = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
    if not pa_version_under13p0:
        expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
        result = values.str.find(sub="d", start=16)
        tm.assert_series_equal(result, expected)
        return
    # https://github.com/apache/arrow/issues/36311
    with pytest.raises(pa.lib.ArrowInvalid, match="Negative buffer resize"):
        values.str.find(sub="d", start=16)
1991
+
1992
+
1993
@pytest.mark.skipif(
    pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311"
)
@pytest.mark.parametrize("start", [*range(-15, 15), None])
@pytest.mark.parametrize("end", [*range(-15, 15), None])
@pytest.mark.parametrize(
    "sub",
    [
        # Every non-empty contiguous slice of "abcaadef" ...
        *("abcaadef"[x:y] for x, y in combinations(range(len("abcaadef") + 1), r=2)),
        # ... plus the empty string and two extra literal substrings.
        "",
        "az",
        "abce",
    ],
)
def test_str_find_e2e(start, end, sub):
    # Compare ``str.find`` on the Arrow string Series with the result of the
    # same call on a ``pd.StringDtype()`` copy, cast to the Arrow result dtype.
    arrow_ser = pd.Series(
        ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""],
        dtype=ArrowDtype(pa.string()),
    )
    reference = arrow_ser.astype(pd.StringDtype())
    result = arrow_ser.str.find(sub, start, end)
    expected = reference.str.find(sub, start, end).astype(result.dtype)
    tm.assert_series_equal(result, expected)
2016
+
2017
+
2018
def test_str_find_negative_start_negative_end_no_match():
    # GH 56791
    # ``start=-3`` with ``end=-6``: the expected result for "abcdefg" is -1.
    values = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
    expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(values.str.find(sub="d", start=-3, end=-6), expected)
1958
2024
1959
2025
1960
2026
@pytest .mark .parametrize (
0 commit comments