13
13
14
14
from pandas import DataFrame , Series , Index , MultiIndex , DatetimeIndex
15
15
from pandas .compat import (
16
- StringIO , BytesIO , PY3 , range , long , lrange , lmap , u , map , StringIO
16
+ StringIO , BytesIO , PY3 , range , long , lrange , lmap , u
17
17
)
18
18
from pandas .io .common import urlopen , URLError
19
19
import pandas .io .parsers as parsers
20
20
from pandas .io .parsers import (read_csv , read_table , read_fwf ,
21
21
TextFileReader , TextParser )
22
- from pandas .util .testing import (assert_almost_equal ,
23
- assert_series_equal ,
24
- makeCustomDataframe as mkdf ,
25
- network ,
26
- ensure_clean )
22
+
27
23
import pandas .util .testing as tm
28
24
import pandas as pd
29
25
@@ -67,6 +63,35 @@ def setUp(self):
67
63
self .csv2 = os .path .join (self .dirpath , 'test2.csv' )
68
64
self .xls1 = os .path .join (self .dirpath , 'test.xls' )
69
65
66
+ def test_multi_character_decimal_marker (self ):
67
+ data = """A|B|C
68
+ 1|2,334|5
69
+ 10|13|10.
70
+ """
71
+ self .assertRaises (ValueError , read_csv , StringIO (data ), decimal = ',,' )
72
+
73
+ def test_empty_decimal_marker (self ):
74
+ data = """A|B|C
75
+ 1|2,334|5
76
+ 10|13|10.
77
+ """
78
+ self .assertRaises (ValueError , read_csv , StringIO (data ), decimal = '' )
79
+
80
+ def test_empty_thousands_marker (self ):
81
+ data = """A|B|C
82
+ 1|2,334|5
83
+ 10|13|10.
84
+ """
85
+ self .assertRaises (ValueError , read_csv , StringIO (data ), thousands = '' )
86
+
87
+
88
+ def test_multi_character_thousands_marker (self ):
89
+ data = """A|B|C
90
+ 1|2,334|5
91
+ 10|13|10.
92
+ """
93
+ self .assertRaises (ValueError , read_csv , StringIO (data ), thousands = ',,' )
94
+
70
95
def test_empty_string (self ):
71
96
data = """\
72
97
One,Two,Three
@@ -164,14 +189,48 @@ def test_1000_sep(self):
164
189
1|2,334|5
165
190
10|13|10.
166
191
"""
167
- expected = [[1 , 2334. , 5 ],
168
- [10 , 13 , 10 ]]
192
+ expected = DataFrame ({
193
+ 'A' : [1 , 10 ],
194
+ 'B' : [2334 , 13 ],
195
+ 'C' : [5 , 10. ]
196
+ })
169
197
170
198
df = self .read_csv (StringIO (data ), sep = '|' , thousands = ',' )
171
- assert_almost_equal (df . values , expected )
199
+ tm . assert_frame_equal (df , expected )
172
200
173
201
df = self .read_table (StringIO (data ), sep = '|' , thousands = ',' )
174
- assert_almost_equal (df .values , expected )
202
+ tm .assert_frame_equal (df , expected )
203
+
204
+ def test_1000_sep_with_decimal (self ):
205
+ data = """A|B|C
206
+ 1|2,334.01|5
207
+ 10|13|10.
208
+ """
209
+ expected = DataFrame ({
210
+ 'A' : [1 , 10 ],
211
+ 'B' : [2334.01 , 13 ],
212
+ 'C' : [5 , 10. ]
213
+ })
214
+
215
+ tm .assert_equal (expected .A .dtype , 'int64' )
216
+ tm .assert_equal (expected .B .dtype , 'float' )
217
+ tm .assert_equal (expected .C .dtype , 'float' )
218
+
219
+ df = self .read_csv (StringIO (data ), sep = '|' , thousands = ',' , decimal = '.' )
220
+ tm .assert_frame_equal (df , expected )
221
+
222
+ df = self .read_table (StringIO (data ), sep = '|' , thousands = ',' , decimal = '.' )
223
+ tm .assert_frame_equal (df , expected )
224
+
225
+ data_with_odd_sep = """A|B|C
226
+ 1|2.334,01|5
227
+ 10|13|10,
228
+ """
229
+ df = self .read_csv (StringIO (data_with_odd_sep ), sep = '|' , thousands = '.' , decimal = ',' )
230
+ tm .assert_frame_equal (df , expected )
231
+
232
+ df = self .read_table (StringIO (data_with_odd_sep ), sep = '|' , thousands = '.' , decimal = ',' )
233
+ tm .assert_frame_equal (df , expected )
175
234
176
235
def test_squeeze (self ):
177
236
data = """\
@@ -183,7 +242,7 @@ def test_squeeze(self):
183
242
result = self .read_table (StringIO (data ), sep = ',' , index_col = 0 ,
184
243
header = None , squeeze = True )
185
244
tm .assert_isinstance (result , Series )
186
- assert_series_equal (result , expected )
245
+ tm . assert_series_equal (result , expected )
187
246
188
247
def test_inf_parsing (self ):
189
248
data = """\
@@ -201,9 +260,9 @@ def test_inf_parsing(self):
201
260
inf = float ('inf' )
202
261
expected = Series ([inf , - inf ] * 5 )
203
262
df = read_csv (StringIO (data ), index_col = 0 )
204
- assert_almost_equal (df ['A' ].values , expected .values )
263
+ tm . assert_almost_equal (df ['A' ].values , expected .values )
205
264
df = read_csv (StringIO (data ), index_col = 0 , na_filter = False )
206
- assert_almost_equal (df ['A' ].values , expected .values )
265
+ tm . assert_almost_equal (df ['A' ].values , expected .values )
207
266
208
267
def test_multiple_date_col (self ):
209
268
# Can use multiple date parsers
@@ -524,7 +583,7 @@ def test_passing_dtype(self):
524
583
525
584
df = DataFrame (np .random .rand (5 ,2 ),columns = list ('AB' ),index = ['1A' ,'1B' ,'1C' ,'1D' ,'1E' ])
526
585
527
- with ensure_clean ('__passing_str_as_dtype__.csv' ) as path :
586
+ with tm . ensure_clean ('__passing_str_as_dtype__.csv' ) as path :
528
587
df .to_csv (path )
529
588
530
589
# GH 3795
@@ -566,7 +625,7 @@ def test_quoting(self):
566
625
567
626
def test_non_string_na_values (self ):
568
627
# GH3611, na_values that are not a string are an issue
569
- with ensure_clean ('__non_string_na_values__.csv' ) as path :
628
+ with tm . ensure_clean ('__non_string_na_values__.csv' ) as path :
570
629
df = DataFrame ({'A' : [- 999 , 2 , 3 ], 'B' : [1.2 , - 999 , 4.5 ]})
571
630
df .to_csv (path , sep = ' ' , index = False )
572
631
result1 = read_csv (path , sep = ' ' , header = 0 , na_values = ['-999.0' ,'-999' ])
@@ -617,15 +676,15 @@ def test_custom_na_values(self):
617
676
[7 , 8 , nan ]]
618
677
619
678
df = self .read_csv (StringIO (data ), na_values = ['baz' ], skiprows = [1 ])
620
- assert_almost_equal (df .values , expected )
679
+ tm . assert_almost_equal (df .values , expected )
621
680
622
681
df2 = self .read_table (StringIO (data ), sep = ',' , na_values = ['baz' ],
623
682
skiprows = [1 ])
624
- assert_almost_equal (df2 .values , expected )
683
+ tm . assert_almost_equal (df2 .values , expected )
625
684
626
685
df3 = self .read_table (StringIO (data ), sep = ',' , na_values = 'baz' ,
627
686
skiprows = [1 ])
628
- assert_almost_equal (df3 .values , expected )
687
+ tm . assert_almost_equal (df3 .values , expected )
629
688
630
689
def test_nat_parse (self ):
631
690
@@ -635,7 +694,7 @@ def test_nat_parse(self):
635
694
'B' : pd .Timestamp ('20010101' ) }))
636
695
df .iloc [3 :6 ,:] = np .nan
637
696
638
- with ensure_clean ('__nat_parse_.csv' ) as path :
697
+ with tm . ensure_clean ('__nat_parse_.csv' ) as path :
639
698
df .to_csv (path )
640
699
result = read_csv (path ,index_col = 0 ,parse_dates = ['B' ])
641
700
tm .assert_frame_equal (result ,df )
@@ -686,7 +745,7 @@ def test_detect_string_na(self):
686
745
[nan , nan ]]
687
746
688
747
df = self .read_csv (StringIO (data ))
689
- assert_almost_equal (df .values , expected )
748
+ tm . assert_almost_equal (df .values , expected )
690
749
691
750
def test_unnamed_columns (self ):
692
751
data = """A,B,C,,
@@ -698,7 +757,7 @@ def test_unnamed_columns(self):
698
757
[6 , 7 , 8 , 9 , 10 ],
699
758
[11 , 12 , 13 , 14 , 15 ]]
700
759
df = self .read_table (StringIO (data ), sep = ',' )
701
- assert_almost_equal (df .values , expected )
760
+ tm . assert_almost_equal (df .values , expected )
702
761
self .assert_ (np .array_equal (df .columns ,
703
762
['A' , 'B' , 'C' , 'Unnamed: 3' ,
704
763
'Unnamed: 4' ]))
@@ -849,8 +908,8 @@ def test_no_header(self):
849
908
expected = [[1 , 2 , 3 , 4 , 5. ],
850
909
[6 , 7 , 8 , 9 , 10 ],
851
910
[11 , 12 , 13 , 14 , 15 ]]
852
- assert_almost_equal (df .values , expected )
853
- assert_almost_equal (df .values , df2 .values )
911
+ tm . assert_almost_equal (df .values , expected )
912
+ tm . assert_almost_equal (df .values , df2 .values )
854
913
855
914
self .assert_ (np .array_equal (df_pref .columns ,
856
915
['X0' , 'X1' , 'X2' , 'X3' , 'X4' ]))
@@ -1113,7 +1172,7 @@ def test_header_not_first_line(self):
1113
1172
tm .assert_frame_equal (df , expected )
1114
1173
1115
1174
def test_header_multi_index (self ):
1116
- expected = mkdf (5 ,3 ,r_idx_nlevels = 2 ,c_idx_nlevels = 4 )
1175
+ expected = tm . makeCustomDataframe (5 ,3 ,r_idx_nlevels = 2 ,c_idx_nlevels = 4 )
1117
1176
1118
1177
data = """\
1119
1178
C0,,C_l0_g0,C_l0_g1,C_l0_g2
@@ -1413,7 +1472,7 @@ def test_na_value_dict(self):
1413
1472
tm .assert_frame_equal (df , xp )
1414
1473
1415
1474
@slow
1416
- @network
1475
+ @tm . network
1417
1476
def test_url (self ):
1418
1477
try :
1419
1478
# HTTP(S)
@@ -1428,7 +1487,7 @@ def test_url(self):
1428
1487
1429
1488
except URLError :
1430
1489
try :
1431
- with closing (urlopen ('http://www.google.com' )) as resp :
1490
+ with tm . closing (urlopen ('http://www.google.com' )) as resp :
1432
1491
pass
1433
1492
except URLError :
1434
1493
raise nose .SkipTest
@@ -1533,11 +1592,11 @@ def test_comment(self):
1533
1592
expected = [[1. , 2. , 4. ],
1534
1593
[5. , np .nan , 10. ]]
1535
1594
df = self .read_csv (StringIO (data ), comment = '#' )
1536
- assert_almost_equal (df .values , expected )
1595
+ tm . assert_almost_equal (df .values , expected )
1537
1596
1538
1597
df = self .read_table (StringIO (data ), sep = ',' , comment = '#' ,
1539
1598
na_values = ['NaN' ])
1540
- assert_almost_equal (df .values , expected )
1599
+ tm . assert_almost_equal (df .values , expected )
1541
1600
1542
1601
def test_bool_na_values (self ):
1543
1602
data = """A,B,C
@@ -1595,7 +1654,7 @@ def test_utf16_bom_skiprows(self):
1595
1654
1596
1655
path = '__%s__.csv' % tm .rands (10 )
1597
1656
1598
- with ensure_clean (path ) as path :
1657
+ with tm . ensure_clean (path ) as path :
1599
1658
for sep , dat in [('\t ' , data ), (',' , data2 )]:
1600
1659
for enc in ['utf-16' , 'utf-16le' , 'utf-16be' ]:
1601
1660
bytes = dat .encode (enc )
@@ -1860,7 +1919,25 @@ def test_1000_fwf(self):
1860
1919
[10 , 13 , 10 ]]
1861
1920
df = read_fwf (StringIO (data ), colspecs = [(0 , 3 ), (3 , 11 ), (12 , 16 )],
1862
1921
thousands = ',' )
1863
- assert_almost_equal (df .values , expected )
1922
+ tm .assert_almost_equal (df .values , expected )
1923
+
1924
+ def test_1000_sep_with_decimal (self ):
1925
+ data = """A|B|C
1926
+ 1|2,334.01|5
1927
+ 10|13|10.
1928
+ """
1929
+
1930
+ expected = DataFrame ({
1931
+ 'A' : [1 , 10 ],
1932
+ 'B' : [2334.01 , 13 ],
1933
+ 'C' : [5 , 10. ]
1934
+ })
1935
+
1936
+ df = self .read_csv (StringIO (data ), sep = '|' , thousands = ',' )
1937
+ tm .assert_frame_equal (df , expected )
1938
+
1939
+ df = self .read_table (StringIO (data ), sep = '|' , thousands = ',' )
1940
+ tm .assert_frame_equal (df , expected )
1864
1941
1865
1942
def test_comment_fwf (self ):
1866
1943
data = """
@@ -1871,7 +1948,7 @@ def test_comment_fwf(self):
1871
1948
[5 , np .nan , 10. ]]
1872
1949
df = read_fwf (StringIO (data ), colspecs = [(0 , 3 ), (4 , 9 ), (9 , 25 )],
1873
1950
comment = '#' )
1874
- assert_almost_equal (df .values , expected )
1951
+ tm . assert_almost_equal (df .values , expected )
1875
1952
1876
1953
def test_fwf (self ):
1877
1954
data_expected = """\
@@ -1993,7 +2070,7 @@ def test_iteration_open_handle(self):
1993
2070
if PY3 :
1994
2071
raise nose .SkipTest
1995
2072
1996
- with ensure_clean () as path :
2073
+ with tm . ensure_clean () as path :
1997
2074
with open (path , 'wb' ) as f :
1998
2075
f .write ('AAA\n BBB\n CCC\n DDD\n EEE\n FFF\n GGG' )
1999
2076
@@ -2212,7 +2289,7 @@ def test_decompression(self):
2212
2289
data = open (self .csv1 , 'rb' ).read ()
2213
2290
expected = self .read_csv (self .csv1 )
2214
2291
2215
- with ensure_clean () as path :
2292
+ with tm . ensure_clean () as path :
2216
2293
tmp = gzip .GzipFile (path , mode = 'wb' )
2217
2294
tmp .write (data )
2218
2295
tmp .close ()
@@ -2223,7 +2300,7 @@ def test_decompression(self):
2223
2300
result = self .read_csv (open (path , 'rb' ), compression = 'gzip' )
2224
2301
tm .assert_frame_equal (result , expected )
2225
2302
2226
- with ensure_clean () as path :
2303
+ with tm . ensure_clean () as path :
2227
2304
tmp = bz2 .BZ2File (path , mode = 'wb' )
2228
2305
tmp .write (data )
2229
2306
tmp .close ()
@@ -2248,15 +2325,15 @@ def test_decompression_regex_sep(self):
2248
2325
data = data .replace (b',' , b'::' )
2249
2326
expected = self .read_csv (self .csv1 )
2250
2327
2251
- with ensure_clean () as path :
2328
+ with tm . ensure_clean () as path :
2252
2329
tmp = gzip .GzipFile (path , mode = 'wb' )
2253
2330
tmp .write (data )
2254
2331
tmp .close ()
2255
2332
2256
2333
result = self .read_csv (path , sep = '::' , compression = 'gzip' )
2257
2334
tm .assert_frame_equal (result , expected )
2258
2335
2259
- with ensure_clean () as path :
2336
+ with tm . ensure_clean () as path :
2260
2337
tmp = bz2 .BZ2File (path , mode = 'wb' )
2261
2338
tmp .write (data )
2262
2339
tmp .close ()
@@ -2470,7 +2547,7 @@ def test_convert_sql_column_decimals(self):
2470
2547
2471
2548
def assert_same_values_and_dtype (res , exp ):
2472
2549
assert (res .dtype == exp .dtype )
2473
- assert_almost_equal (res , exp )
2550
+ tm . assert_almost_equal (res , exp )
2474
2551
2475
2552
2476
2553
if __name__ == '__main__' :
0 commit comments