Skip to content

Commit d536ff6

Browse files
committed
Merge pull request #4598 from guyrt/csv-import-arg-conflict
csv_import: Thousands separator works in floating point numbers
2 parents 223d0a5 + 93ea765 commit d536ff6

File tree

7 files changed

+142
-54
lines changed

7 files changed

+142
-54
lines changed

doc/source/io.rst

+8-6
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,11 @@ They can take a number of arguments:
118118
date_converters.py
119119
- ``dayfirst``: if True then uses the DD/MM international/European date format
120120
(This is False by default)
121-
- ``thousands``: sepcifies the thousands separator. If not None, then parser
122-
will try to look for it in the output and parse relevant data to integers.
123-
Because it has to essentially scan through the data again, this causes a
121+
- ``thousands``: specifies the thousands separator. If not None, this character will
122+
be stripped from numeric dtypes. However, if it is the first character in a field,
123+
that column will be imported as a string. In the PythonParser, if not None,
124+
then the parser will try to look for it in the output and parse relevant data to numeric
125+
dtypes. Because it has to essentially scan through the data again, this causes a
124126
significant performance hit so only use if necessary.
125127
- ``lineterminator`` : string (length 1), default ``None``, Character to break file into lines. Only valid with C parser
126128
- ``quotechar`` : string, The character used to denote the start and end of a quoted item.
@@ -506,8 +508,8 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided:
506508

507509
Thousand Separators
508510
~~~~~~~~~~~~~~~~~~~
509-
For large integers that have been written with a thousands separator, you can
510-
set the ``thousands`` keyword to ``True`` so that integers will be parsed
511+
For large numbers that have been written with a thousands separator, you can
512+
set the ``thousands`` keyword to a string of length 1 so that numbers will be parsed
511513
correctly:
512514

513515
.. ipython:: python
@@ -521,7 +523,7 @@ correctly:
521523
with open('tmp.csv', 'w') as fh:
522524
fh.write(data)
523525
524-
By default, integers with a thousands separator will be parsed as strings
526+
By default, numbers with a thousands separator will be parsed as strings
525527

526528
.. ipython:: python
527529

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
273273
- Fix selection with ``ix/loc`` and non_unique selectors (:issue:`4619`)
274274
- Fix assignment with iloc/loc involving a dtype change in an existing column (:issue:`4312`)
275275
have internal setitem_with_indexer in core/indexing to use Block.setitem
276+
- Fixed bug where thousands separator was not handled correctly for floating point numbers
277+
in csv_import (:issue:`4322`)
276278

277279
pandas 0.12
278280
===========

doc/source/v0.13.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,9 @@ Bug Fixes
286286

287287
- Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`)
288288

289+
- Fixed bug where thousands separator was not handled correctly for floating point numbers
290+
in csv_import (:issue:`4322`)
291+
289292
See the :ref:`full release notes
290293
<release>` or issue tracker
291294
on GitHub for a complete list.

pandas/io/tests/test_parsers.py

+114-37
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,13 @@
1313

1414
from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
1515
from pandas.compat import(
16-
StringIO, BytesIO, PY3, range, long, lrange, lmap, u, map, StringIO
16+
StringIO, BytesIO, PY3, range, long, lrange, lmap, u
1717
)
1818
from pandas.io.common import urlopen, URLError
1919
import pandas.io.parsers as parsers
2020
from pandas.io.parsers import (read_csv, read_table, read_fwf,
2121
TextFileReader, TextParser)
22-
from pandas.util.testing import (assert_almost_equal,
23-
assert_series_equal,
24-
makeCustomDataframe as mkdf,
25-
network,
26-
ensure_clean)
22+
2723
import pandas.util.testing as tm
2824
import pandas as pd
2925

@@ -67,6 +63,35 @@ def setUp(self):
6763
self.csv2 = os.path.join(self.dirpath, 'test2.csv')
6864
self.xls1 = os.path.join(self.dirpath, 'test.xls')
6965

66+
def test_multi_character_decimal_marker(self):
67+
data = """A|B|C
68+
1|2,334|5
69+
10|13|10.
70+
"""
71+
self.assertRaises(ValueError, read_csv, StringIO(data), decimal=',,')
72+
73+
def test_empty_decimal_marker(self):
74+
data = """A|B|C
75+
1|2,334|5
76+
10|13|10.
77+
"""
78+
self.assertRaises(ValueError, read_csv, StringIO(data), decimal='')
79+
80+
def test_empty_thousands_marker(self):
81+
data = """A|B|C
82+
1|2,334|5
83+
10|13|10.
84+
"""
85+
self.assertRaises(ValueError, read_csv, StringIO(data), thousands='')
86+
87+
88+
def test_multi_character_decimal_marker(self):
89+
data = """A|B|C
90+
1|2,334|5
91+
10|13|10.
92+
"""
93+
self.assertRaises(ValueError, read_csv, StringIO(data), thousands=',,')
94+
7095
def test_empty_string(self):
7196
data = """\
7297
One,Two,Three
@@ -164,14 +189,48 @@ def test_1000_sep(self):
164189
1|2,334|5
165190
10|13|10.
166191
"""
167-
expected = [[1, 2334., 5],
168-
[10, 13, 10]]
192+
expected = DataFrame({
193+
'A': [1, 10],
194+
'B': [2334, 13],
195+
'C': [5, 10.]
196+
})
169197

170198
df = self.read_csv(StringIO(data), sep='|', thousands=',')
171-
assert_almost_equal(df.values, expected)
199+
tm.assert_frame_equal(df, expected)
172200

173201
df = self.read_table(StringIO(data), sep='|', thousands=',')
174-
assert_almost_equal(df.values, expected)
202+
tm.assert_frame_equal(df, expected)
203+
204+
def test_1000_sep_with_decimal(self):
205+
data = """A|B|C
206+
1|2,334.01|5
207+
10|13|10.
208+
"""
209+
expected = DataFrame({
210+
'A': [1, 10],
211+
'B': [2334.01, 13],
212+
'C': [5, 10.]
213+
})
214+
215+
tm.assert_equal(expected.A.dtype, 'int64')
216+
tm.assert_equal(expected.B.dtype, 'float')
217+
tm.assert_equal(expected.C.dtype, 'float')
218+
219+
df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
220+
tm.assert_frame_equal(df, expected)
221+
222+
df = self.read_table(StringIO(data), sep='|', thousands=',', decimal='.')
223+
tm.assert_frame_equal(df, expected)
224+
225+
data_with_odd_sep = """A|B|C
226+
1|2.334,01|5
227+
10|13|10,
228+
"""
229+
df = self.read_csv(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
230+
tm.assert_frame_equal(df, expected)
231+
232+
df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
233+
tm.assert_frame_equal(df, expected)
175234

176235
def test_squeeze(self):
177236
data = """\
@@ -183,7 +242,7 @@ def test_squeeze(self):
183242
result = self.read_table(StringIO(data), sep=',', index_col=0,
184243
header=None, squeeze=True)
185244
tm.assert_isinstance(result, Series)
186-
assert_series_equal(result, expected)
245+
tm.assert_series_equal(result, expected)
187246

188247
def test_inf_parsing(self):
189248
data = """\
@@ -201,9 +260,9 @@ def test_inf_parsing(self):
201260
inf = float('inf')
202261
expected = Series([inf, -inf] * 5)
203262
df = read_csv(StringIO(data), index_col=0)
204-
assert_almost_equal(df['A'].values, expected.values)
263+
tm.assert_almost_equal(df['A'].values, expected.values)
205264
df = read_csv(StringIO(data), index_col=0, na_filter=False)
206-
assert_almost_equal(df['A'].values, expected.values)
265+
tm.assert_almost_equal(df['A'].values, expected.values)
207266

208267
def test_multiple_date_col(self):
209268
# Can use multiple date parsers
@@ -524,7 +583,7 @@ def test_passing_dtype(self):
524583

525584
df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E'])
526585

527-
with ensure_clean('__passing_str_as_dtype__.csv') as path:
586+
with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
528587
df.to_csv(path)
529588

530589
# GH 3795
@@ -566,7 +625,7 @@ def test_quoting(self):
566625

567626
def test_non_string_na_values(self):
568627
# GH3611, na_values that are not a string are an issue
569-
with ensure_clean('__non_string_na_values__.csv') as path:
628+
with tm.ensure_clean('__non_string_na_values__.csv') as path:
570629
df = DataFrame({'A' : [-999, 2, 3], 'B' : [1.2, -999, 4.5]})
571630
df.to_csv(path, sep=' ', index=False)
572631
result1 = read_csv(path, sep= ' ', header=0, na_values=['-999.0','-999'])
@@ -617,15 +676,15 @@ def test_custom_na_values(self):
617676
[7, 8, nan]]
618677

619678
df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
620-
assert_almost_equal(df.values, expected)
679+
tm.assert_almost_equal(df.values, expected)
621680

622681
df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'],
623682
skiprows=[1])
624-
assert_almost_equal(df2.values, expected)
683+
tm.assert_almost_equal(df2.values, expected)
625684

626685
df3 = self.read_table(StringIO(data), sep=',', na_values='baz',
627686
skiprows=[1])
628-
assert_almost_equal(df3.values, expected)
687+
tm.assert_almost_equal(df3.values, expected)
629688

630689
def test_nat_parse(self):
631690

@@ -635,7 +694,7 @@ def test_nat_parse(self):
635694
'B' : pd.Timestamp('20010101') }))
636695
df.iloc[3:6,:] = np.nan
637696

638-
with ensure_clean('__nat_parse_.csv') as path:
697+
with tm.ensure_clean('__nat_parse_.csv') as path:
639698
df.to_csv(path)
640699
result = read_csv(path,index_col=0,parse_dates=['B'])
641700
tm.assert_frame_equal(result,df)
@@ -686,7 +745,7 @@ def test_detect_string_na(self):
686745
[nan, nan]]
687746

688747
df = self.read_csv(StringIO(data))
689-
assert_almost_equal(df.values, expected)
748+
tm.assert_almost_equal(df.values, expected)
690749

691750
def test_unnamed_columns(self):
692751
data = """A,B,C,,
@@ -698,7 +757,7 @@ def test_unnamed_columns(self):
698757
[6, 7, 8, 9, 10],
699758
[11, 12, 13, 14, 15]]
700759
df = self.read_table(StringIO(data), sep=',')
701-
assert_almost_equal(df.values, expected)
760+
tm.assert_almost_equal(df.values, expected)
702761
self.assert_(np.array_equal(df.columns,
703762
['A', 'B', 'C', 'Unnamed: 3',
704763
'Unnamed: 4']))
@@ -849,8 +908,8 @@ def test_no_header(self):
849908
expected = [[1, 2, 3, 4, 5.],
850909
[6, 7, 8, 9, 10],
851910
[11, 12, 13, 14, 15]]
852-
assert_almost_equal(df.values, expected)
853-
assert_almost_equal(df.values, df2.values)
911+
tm.assert_almost_equal(df.values, expected)
912+
tm.assert_almost_equal(df.values, df2.values)
854913

855914
self.assert_(np.array_equal(df_pref.columns,
856915
['X0', 'X1', 'X2', 'X3', 'X4']))
@@ -1113,7 +1172,7 @@ def test_header_not_first_line(self):
11131172
tm.assert_frame_equal(df, expected)
11141173

11151174
def test_header_multi_index(self):
1116-
expected = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
1175+
expected = tm.makeCustomDataframe(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
11171176

11181177
data = """\
11191178
C0,,C_l0_g0,C_l0_g1,C_l0_g2
@@ -1413,7 +1472,7 @@ def test_na_value_dict(self):
14131472
tm.assert_frame_equal(df, xp)
14141473

14151474
@slow
1416-
@network
1475+
@tm.network
14171476
def test_url(self):
14181477
try:
14191478
# HTTP(S)
@@ -1428,7 +1487,7 @@ def test_url(self):
14281487

14291488
except URLError:
14301489
try:
1431-
with closing(urlopen('http://www.google.com')) as resp:
1490+
with tm.closing(urlopen('http://www.google.com')) as resp:
14321491
pass
14331492
except URLError:
14341493
raise nose.SkipTest
@@ -1533,11 +1592,11 @@ def test_comment(self):
15331592
expected = [[1., 2., 4.],
15341593
[5., np.nan, 10.]]
15351594
df = self.read_csv(StringIO(data), comment='#')
1536-
assert_almost_equal(df.values, expected)
1595+
tm.assert_almost_equal(df.values, expected)
15371596

15381597
df = self.read_table(StringIO(data), sep=',', comment='#',
15391598
na_values=['NaN'])
1540-
assert_almost_equal(df.values, expected)
1599+
tm.assert_almost_equal(df.values, expected)
15411600

15421601
def test_bool_na_values(self):
15431602
data = """A,B,C
@@ -1595,7 +1654,7 @@ def test_utf16_bom_skiprows(self):
15951654

15961655
path = '__%s__.csv' % tm.rands(10)
15971656

1598-
with ensure_clean(path) as path:
1657+
with tm.ensure_clean(path) as path:
15991658
for sep, dat in [('\t', data), (',', data2)]:
16001659
for enc in ['utf-16', 'utf-16le', 'utf-16be']:
16011660
bytes = dat.encode(enc)
@@ -1860,7 +1919,25 @@ def test_1000_fwf(self):
18601919
[10, 13, 10]]
18611920
df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)],
18621921
thousands=',')
1863-
assert_almost_equal(df.values, expected)
1922+
tm.assert_almost_equal(df.values, expected)
1923+
1924+
def test_1000_sep_with_decimal(self):
1925+
data = """A|B|C
1926+
1|2,334.01|5
1927+
10|13|10.
1928+
"""
1929+
1930+
expected = DataFrame({
1931+
'A': [1, 10],
1932+
'B': [2334.01, 13],
1933+
'C': [5, 10.]
1934+
})
1935+
1936+
df = self.read_csv(StringIO(data), sep='|', thousands=',')
1937+
tm.assert_frame_equal(df, expected)
1938+
1939+
df = self.read_table(StringIO(data), sep='|', thousands=',')
1940+
tm.assert_frame_equal(df, expected)
18641941

18651942
def test_comment_fwf(self):
18661943
data = """
@@ -1871,7 +1948,7 @@ def test_comment_fwf(self):
18711948
[5, np.nan, 10.]]
18721949
df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)],
18731950
comment='#')
1874-
assert_almost_equal(df.values, expected)
1951+
tm.assert_almost_equal(df.values, expected)
18751952

18761953
def test_fwf(self):
18771954
data_expected = """\
@@ -1993,7 +2070,7 @@ def test_iteration_open_handle(self):
19932070
if PY3:
19942071
raise nose.SkipTest
19952072

1996-
with ensure_clean() as path:
2073+
with tm.ensure_clean() as path:
19972074
with open(path, 'wb') as f:
19982075
f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG')
19992076

@@ -2212,7 +2289,7 @@ def test_decompression(self):
22122289
data = open(self.csv1, 'rb').read()
22132290
expected = self.read_csv(self.csv1)
22142291

2215-
with ensure_clean() as path:
2292+
with tm.ensure_clean() as path:
22162293
tmp = gzip.GzipFile(path, mode='wb')
22172294
tmp.write(data)
22182295
tmp.close()
@@ -2223,7 +2300,7 @@ def test_decompression(self):
22232300
result = self.read_csv(open(path, 'rb'), compression='gzip')
22242301
tm.assert_frame_equal(result, expected)
22252302

2226-
with ensure_clean() as path:
2303+
with tm.ensure_clean() as path:
22272304
tmp = bz2.BZ2File(path, mode='wb')
22282305
tmp.write(data)
22292306
tmp.close()
@@ -2248,15 +2325,15 @@ def test_decompression_regex_sep(self):
22482325
data = data.replace(b',', b'::')
22492326
expected = self.read_csv(self.csv1)
22502327

2251-
with ensure_clean() as path:
2328+
with tm.ensure_clean() as path:
22522329
tmp = gzip.GzipFile(path, mode='wb')
22532330
tmp.write(data)
22542331
tmp.close()
22552332

22562333
result = self.read_csv(path, sep='::', compression='gzip')
22572334
tm.assert_frame_equal(result, expected)
22582335

2259-
with ensure_clean() as path:
2336+
with tm.ensure_clean() as path:
22602337
tmp = bz2.BZ2File(path, mode='wb')
22612338
tmp.write(data)
22622339
tmp.close()
@@ -2470,7 +2547,7 @@ def test_convert_sql_column_decimals(self):
24702547

24712548
def assert_same_values_and_dtype(res, exp):
24722549
assert(res.dtype == exp.dtype)
2473-
assert_almost_equal(res, exp)
2550+
tm.assert_almost_equal(res, exp)
24742551

24752552

24762553
if __name__ == '__main__':

0 commit comments

Comments
 (0)