Skip to content

Commit 0922599

Browse files
committed
BUG: fixes issue pandas-dev#4322
Adds support for the thousands character in csv parser for floats. Updated docs to reflect bug fix.
1 parent cba88ed commit 0922599

File tree

7 files changed

+121
-25
lines changed

7 files changed

+121
-25
lines changed

doc/source/io.rst

+11-9
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,11 @@ They can take a number of arguments:
118118
date_converters.py
119119
- ``dayfirst``: if True then uses the DD/MM international/European date format
120120
(This is False by default)
121-
- ``thousands``: sepcifies the thousands separator. If not None, then parser
122-
will try to look for it in the output and parse relevant data to integers.
123-
Because it has to essentially scan through the data again, this causes a
121+
- ``thousands``: specifies the thousands separator. If not None, this character will
122+
be stripped from numeric dtypes. However, if it is the first character in a field,
123+
that column will be imported as a string. In the PythonParser, if not None,
124+
then the parser will try to look for it in the output and parse relevant data to numeric
125+
dtypes. Because it has to essentially scan through the data again, this causes a
124126
significant performance hit so only use if necessary.
125127
- ``lineterminator`` : string (length 1), default ``None``, Character to break file into lines. Only valid with C parser
126128
- ``quotechar`` : string, The character to used to denote the start and end of a quoted item.
@@ -506,8 +508,8 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided:
506508

507509
Thousand Separators
508510
~~~~~~~~~~~~~~~~~~~
509-
For large integers that have been written with a thousands separator, you can
510-
set the ``thousands`` keyword to ``True`` so that integers will be parsed
511+
For large numbers that have been written with a thousands separator, you can
512+
set the ``thousands`` keyword to a string of length 1 so that integers will be parsed
511513
correctly:
512514

513515
.. ipython:: python
@@ -521,7 +523,7 @@ correctly:
521523
with open('tmp.csv', 'w') as fh:
522524
fh.write(data)
523525
524-
By default, integers with a thousands separator will be parsed as strings
526+
By default, numbers with a thousands separator will be parsed as strings
525527

526528
.. ipython:: python
527529
@@ -1123,7 +1125,7 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series``
11231125
- ``numpy`` : direct decoding to numpy arrays. default is False;
11241126
Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``
11251127
- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality
1126-
- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
1128+
- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
11271129
None. By default the timestamp precision will be detected, if this is not desired
11281130
then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to
11291131
seconds, milliseconds, microseconds or nanoseconds respectively.
@@ -1201,11 +1203,11 @@ nanoseconds
12011203
dfju
12021204
12031205
# Let Pandas detect the correct precision
1204-
dfju = pd.read_json(json)
1206+
dfju = pd.read_json(json)
12051207
dfju
12061208
12071209
# Or specify that all timestamps are in nanoseconds
1208-
dfju = pd.read_json(json, date_unit='ns')
1210+
dfju = pd.read_json(json, date_unit='ns')
12091211
dfju
12101212
12111213
.. ipython:: python

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
272272
- Fix selection with ``ix/loc`` and non_unique selectors (:issue:`4619`)
273273
- Fix assignment with iloc/loc involving a dtype change in an existing column (:issue:`4312`)
274274
have internal setitem_with_indexer in core/indexing to use Block.setitem
275+
- Fixed bug where thousands separator was not handled correctly for floating point numbers
276+
in csv_import (:issue:`4322`)
275277

276278
pandas 0.12
277279
===========

doc/source/v0.13.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,9 @@ Bug Fixes
278278

279279
- Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`)
280280

281+
- Fixed bug where thousands separator was not handled correctly for floating point numbers
282+
in csv_import (:issue:`4322`)
283+
281284
See the :ref:`full release notes
282285
<release>` or issue tracker
283286
on GitHub for a complete list.

pandas/io/tests/test_parsers.py

+87-5
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
import pandas.io.parsers as parsers
2020
from pandas.io.parsers import (read_csv, read_table, read_fwf,
2121
TextFileReader, TextParser)
22-
from pandas.util.testing import (assert_almost_equal,
22+
from pandas.util.testing import (assert_equal,
23+
assert_almost_equal,
2324
assert_series_equal,
2425
makeCustomDataframe as mkdf,
2526
network,
@@ -67,6 +68,35 @@ def setUp(self):
6768
self.csv2 = os.path.join(self.dirpath, 'test2.csv')
6869
self.xls1 = os.path.join(self.dirpath, 'test.xls')
6970

71+
def test_multi_character_decimal_marker(self):
72+
data = """A|B|C
73+
1|2,334|5
74+
10|13|10.
75+
"""
76+
self.assertRaises(ValueError, read_csv, StringIO(data), decimal=',,')
77+
78+
def test_empty_decimal_marker(self):
79+
data = """A|B|C
80+
1|2,334|5
81+
10|13|10.
82+
"""
83+
self.assertRaises(ValueError, read_csv, StringIO(data), decimal='')
84+
85+
def test_empty_thousands_marker(self):
86+
data = """A|B|C
87+
1|2,334|5
88+
10|13|10.
89+
"""
90+
self.assertRaises(ValueError, read_csv, StringIO(data), thousands='')
91+
92+
93+
def test_multi_character_thousands_marker(self):
94+
data = """A|B|C
95+
1|2,334|5
96+
10|13|10.
97+
"""
98+
self.assertRaises(ValueError, read_csv, StringIO(data), thousands=',,')
99+
70100
def test_empty_string(self):
71101
data = """\
72102
One,Two,Three
@@ -164,14 +194,48 @@ def test_1000_sep(self):
164194
1|2,334|5
165195
10|13|10.
166196
"""
167-
expected = [[1, 2334., 5],
168-
[10, 13, 10]]
197+
expected = DataFrame({
198+
'A': [1, 10],
199+
'B': [2334, 13],
200+
'C': [5, 10.]
201+
})
169202

170203
df = self.read_csv(StringIO(data), sep='|', thousands=',')
171-
assert_almost_equal(df.values, expected)
204+
tm.assert_frame_equal(df, expected)
172205

173206
df = self.read_table(StringIO(data), sep='|', thousands=',')
174-
assert_almost_equal(df.values, expected)
207+
tm.assert_frame_equal(df, expected)
208+
209+
def test_1000_sep_with_decimal(self):
210+
data = """A|B|C
211+
1|2,334.01|5
212+
10|13|10.
213+
"""
214+
expected = DataFrame({
215+
'A': [1, 10],
216+
'B': [2334.01, 13],
217+
'C': [5, 10.]
218+
})
219+
220+
assert_equal(expected.A.dtype, 'int64')
221+
assert_equal(expected.B.dtype, 'float')
222+
assert_equal(expected.C.dtype, 'float')
223+
224+
df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
225+
tm.assert_frame_equal(df, expected)
226+
227+
df = self.read_table(StringIO(data), sep='|', thousands=',', decimal='.')
228+
tm.assert_frame_equal(df, expected)
229+
230+
data_with_odd_sep = """A|B|C
231+
1|2.334,01|5
232+
10|13|10,
233+
"""
234+
df = self.read_csv(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
235+
tm.assert_frame_equal(df, expected)
236+
237+
df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
238+
tm.assert_frame_equal(df, expected)
175239

176240
def test_squeeze(self):
177241
data = """\
@@ -1862,6 +1926,24 @@ def test_1000_fwf(self):
18621926
thousands=',')
18631927
assert_almost_equal(df.values, expected)
18641928

1929+
def test_1000_sep_with_decimal(self):
1930+
data = """A|B|C
1931+
1|2,334.01|5
1932+
10|13|10.
1933+
"""
1934+
1935+
expected = DataFrame({
1936+
'A': [1, 10],
1937+
'B': [2334.01, 13],
1938+
'C': [5, 10.]
1939+
})
1940+
1941+
df = self.read_csv(StringIO(data), sep='|', thousands=',')
1942+
tm.assert_frame_equal(df, expected)
1943+
1944+
df = self.read_table(StringIO(data), sep='|', thousands=',')
1945+
tm.assert_frame_equal(df, expected)
1946+
18651947
def test_comment_fwf(self):
18661948
data = """
18671949
1 2. 4 #hello world

pandas/parser.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ cdef extern from "parser/tokenizer.h":
186186
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
187187

188188
inline int to_double(char *item, double *p_value,
189-
char sci, char decimal)
189+
char sci, char decimal, char thousands)
190190
inline int to_complex(char *item, double *p_real,
191191
double *p_imag, char sci, char decimal)
192192
inline int to_longlong(char *item, long long *p_value)
@@ -355,7 +355,7 @@ cdef class TextReader:
355355

356356
if thousands is not None:
357357
if len(thousands) != 1:
358-
raise ValueError('Only length-1 decimal markers supported')
358+
raise ValueError('Only length-1 thousands markers supported')
359359
self.parser.thousands = ord(thousands)
360360

361361
if escapechar is not None:
@@ -1397,7 +1397,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
13971397
na_count += 1
13981398
data[0] = NA
13991399
else:
1400-
error = to_double(word, data, parser.sci, parser.decimal)
1400+
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
14011401
if error != 1:
14021402
if strcasecmp(word, cinf) == 0:
14031403
data[0] = INF
@@ -1413,7 +1413,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
14131413
else:
14141414
for i in range(lines):
14151415
word = COLITER_NEXT(it)
1416-
error = to_double(word, data, parser.sci, parser.decimal)
1416+
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
14171417
if error != 1:
14181418
if strcasecmp(word, cinf) == 0:
14191419
data[0] = INF

pandas/src/parser/tokenizer.c

+13-6
Original file line numberDiff line numberDiff line change
@@ -1633,7 +1633,7 @@ void test_count_lines(char *fname) {
16331633

16341634

16351635
// forward declaration
1636-
static double xstrtod(const char *p, char **q, char decimal, char sci, int skip_trailing);
1636+
static double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing);
16371637

16381638

16391639
P_INLINE void lowercase(char *p) {
@@ -1661,11 +1661,11 @@ P_INLINE void uppercase(char *p) {
16611661
*
16621662
*/
16631663

1664-
int to_double(char *item, double *p_value, char sci, char decimal)
1664+
int to_double(char *item, double *p_value, char sci, char decimal, char tsep)
16651665
{
16661666
char *p_end;
16671667

1668-
*p_value = xstrtod(item, &p_end, decimal, sci, TRUE);
1668+
*p_value = xstrtod(item, &p_end, decimal, sci, tsep, TRUE);
16691669

16701670
return (errno == 0) && (!*p_end);
16711671
}
@@ -1675,7 +1675,7 @@ int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, ch
16751675
{
16761676
char *p_end;
16771677

1678-
*p_real = xstrtod(item, &p_end, decimal, sci, FALSE);
1678+
*p_real = xstrtod(item, &p_end, decimal, sci, '\0', FALSE);
16791679
if (*p_end == '\0') {
16801680
*p_imag = 0.0;
16811681
return errno == 0;
@@ -1689,7 +1689,7 @@ int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, ch
16891689
if (*p_end == '+') {
16901690
++p_end;
16911691
}
1692-
*p_imag = xstrtod(p_end, &p_end, decimal, sci, FALSE);
1692+
*p_imag = xstrtod(p_end, &p_end, decimal, sci, '\0', FALSE);
16931693
if (errno || ((*p_end != 'i') && (*p_end != 'j'))) {
16941694
return FALSE;
16951695
}
@@ -1856,10 +1856,12 @@ int main(int argc, char *argv[])
18561856
// * Added decimal and sci arguments.
18571857
// * Skip trailing spaces.
18581858
// * Commented out the other functions.
1859+
// Modifications by Richard T Guy, August 2013:
1860+
// * Add tsep argument for thousands separator
18591861
//
18601862

18611863
static double xstrtod(const char *str, char **endptr, char decimal,
1862-
char sci, int skip_trailing)
1864+
char sci, char tsep, int skip_trailing)
18631865
{
18641866
double number;
18651867
int exponent;
@@ -1894,6 +1896,11 @@ static double xstrtod(const char *str, char **endptr, char decimal,
18941896
number = number * 10. + (*p - '0');
18951897
p++;
18961898
num_digits++;
1899+
1900+
if (tsep != '\0' && *p == tsep)
1901+
{
1902+
++p;
1903+
}
18971904
}
18981905

18991906
// Process decimal part

pandas/src/parser/tokenizer.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min,
255255
int64_t int_max, int *error, char tsep);
256256
uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error);
257257

258-
int P_INLINE to_double(char *item, double *p_value, char sci, char decimal);
258+
int P_INLINE to_double(char *item, double *p_value, char sci, char decimal, char tsep);
259259
int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal);
260260
int P_INLINE to_longlong(char *item, long long *p_value);
261261
int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep);

0 commit comments

Comments
 (0)