diff --git a/doc/source/io.rst b/doc/source/io.rst
index ba68943f1324a..f09ae1563f71b 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -118,9 +118,11 @@ They can take a number of arguments:
         date_converters.py
   - ``dayfirst``: if True then uses the DD/MM international/European date format
     (This is False by default)
-  - ``thousands``: sepcifies the thousands separator. If not None, then parser
-    will try to look for it in the output and parse relevant data to integers.
-    Because it has to essentially scan through the data again, this causes a
+  - ``thousands``: specifies the thousands separator. If not None, this character will
+    be stripped from numeric fields. However, if it is the first character in a field,
+    that column will be imported as a string. In the PythonParser, if not None, the
+    parser will try to look for it in the output and parse relevant data to numeric
+    dtypes. Because it has to essentially scan through the data again, this causes a
     significant performance hit so only use if necessary.
   - ``lineterminator`` : string (length 1), default ``None``, Character to break file into lines. Only valid with C parser
   - ``quotechar`` : string, The character to used to denote the start and end of a quoted item.
@@ -506,8 +508,8 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided:
 Thousand Separators
 ~~~~~~~~~~~~~~~~~~~
 
-For large integers that have been written with a thousands separator, you can
-set the ``thousands`` keyword to ``True`` so that integers will be parsed
+For large numbers that have been written with a thousands separator, you can
+set the ``thousands`` keyword to a string of length 1 so that numbers will be parsed
 correctly:
 
 .. ipython:: python
@@ -521,7 +523,7 @@ correctly:
    with open('tmp.csv', 'w') as fh:
        fh.write(data)
 
-By default, integers with a thousands separator will be parsed as strings
+By default, numbers with a thousands separator will be parsed as strings
 
 .. ipython:: python
@@ -1123,7 +1125,7 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series``
 - ``numpy`` : direct decoding to numpy arrays. default is False;
   Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``
 - ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality
-- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
+- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default None.
  By default the timestamp precision will be detected, if this is not desired then pass
  one of 's', 'ms', 'us' or 'ns' to force timestamp precision to seconds, milliseconds,
  microseconds or nanoseconds respectively.
@@ -1201,11 +1203,11 @@ nanoseconds
    dfju
 
    # Let Pandas detect the correct precision
-   dfju = pd.read_json(json)
+   dfju = pd.read_json(json)
    dfju
 
    # Or specify that all timestamps are in nanoseconds
-   dfju = pd.read_json(json, date_unit='ns')
+   dfju = pd.read_json(json, date_unit='ns')
    dfju
 
 .. ipython:: python
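The documented behavior above is easy to see in isolation. A minimal sketch (column name and values invented for illustration, not taken from the patch):

```python
from io import StringIO
import pandas as pd

data = "level\n1,000\n2,500,000\n"

# Without `thousands`, the separator keeps the column as strings (object dtype).
df = pd.read_csv(StringIO(data))
print(df['level'].dtype)   # object

# With a length-1 `thousands` marker, the values parse as integers.
df = pd.read_csv(StringIO(data), thousands=',')
print(df['level'].dtype)   # int64
```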
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 14ed2cab10eac..c7a3eb34d07f9 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -272,6 +272,8 @@ See :ref:`Internal Refactoring`
   - Fix selection with ``ix/loc`` and non_unique selectors (:issue:`4619`)
   - Fix assignment with iloc/loc involving a dtype change in an existing column (:issue:`4312`)
     have internal setitem_with_indexer in core/indexing to use Block.setitem
+  - Fixed bug where the thousands separator was not handled correctly for floating point
+    numbers in ``read_csv`` (:issue:`4322`)
 
 pandas 0.12
 ===========
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
index 6e735fb5de1e5..fea84f83daa46 100644
--- a/doc/source/v0.13.0.txt
+++ b/doc/source/v0.13.0.txt
@@ -278,6 +278,9 @@ Bug Fixes
 
   - Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`)
 
+  - Fixed bug where the thousands separator was not handled correctly for floating point
+    numbers in ``read_csv`` (:issue:`4322`)
+
 See the :ref:`full release notes <release>` or issue tracker
 on GitHub for a complete list.
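A minimal reproduction of the behavior these release notes describe (data invented for illustration; before this patch the C parser only honored ``thousands`` for integer columns, so the float field fell back to a string):

```python
from io import StringIO
import pandas as pd

data = "A|B\n1|2,334.01\n10|13\n"

# With the fix, the float column parses despite the thousands separator.
df = pd.read_csv(StringIO(data), sep='|', thousands=',')
print(df['B'].tolist())   # [2334.01, 13.0]
```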
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 787682f340250..6668cfd73a6b7 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -13,17 +13,13 @@
 from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
 from pandas.compat import(
-    StringIO, BytesIO, PY3, range, long, lrange, lmap, u, map, StringIO
+    StringIO, BytesIO, PY3, range, long, lrange, lmap, u
 )
 from pandas.io.common import urlopen, URLError
 import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                TextFileReader, TextParser)
-from pandas.util.testing import (assert_almost_equal,
-                                 assert_series_equal,
-                                 makeCustomDataframe as mkdf,
-                                 network,
-                                 ensure_clean)
+
 import pandas.util.testing as tm
 
 import pandas as pd
@@ -67,6 +63,35 @@ def setUp(self):
         self.csv2 = os.path.join(self.dirpath, 'test2.csv')
         self.xls1 = os.path.join(self.dirpath, 'test.xls')
 
+    def test_multi_character_decimal_marker(self):
+        data = """A|B|C
+1|2,334|5
+10|13|10.
+"""
+        self.assertRaises(ValueError, read_csv, StringIO(data), decimal=',,')
+
+    def test_empty_decimal_marker(self):
+        data = """A|B|C
+1|2,334|5
+10|13|10.
+"""
+        self.assertRaises(ValueError, read_csv, StringIO(data), decimal='')
+
+    def test_empty_thousands_marker(self):
+        data = """A|B|C
+1|2,334|5
+10|13|10.
+"""
+        self.assertRaises(ValueError, read_csv, StringIO(data), thousands='')
+
+
+    def test_multi_character_thousands_marker(self):
+        data = """A|B|C
+1|2,334|5
+10|13|10.
+"""
+        self.assertRaises(ValueError, read_csv, StringIO(data), thousands=',,')
+
     def test_empty_string(self):
         data = """\
 One,Two,Three
@@ -164,14 +189,48 @@ def test_1000_sep(self):
 1|2,334|5
 10|13|10.
 """
-        expected = [[1, 2334., 5],
-                    [10, 13, 10]]
+        expected = DataFrame({
+            'A': [1, 10],
+            'B': [2334, 13],
+            'C': [5, 10.]
+        })
 
         df = self.read_csv(StringIO(data), sep='|', thousands=',')
-        assert_almost_equal(df.values, expected)
+        tm.assert_frame_equal(df, expected)
 
         df = self.read_table(StringIO(data), sep='|', thousands=',')
-        assert_almost_equal(df.values, expected)
+        tm.assert_frame_equal(df, expected)
+
+    def test_1000_sep_with_decimal(self):
+        data = """A|B|C
+1|2,334.01|5
+10|13|10.
+"""
+        expected = DataFrame({
+            'A': [1, 10],
+            'B': [2334.01, 13],
+            'C': [5, 10.]
+        })
+
+        tm.assert_equal(expected.A.dtype, 'int64')
+        tm.assert_equal(expected.B.dtype, 'float')
+        tm.assert_equal(expected.C.dtype, 'float')
+
+        df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_table(StringIO(data), sep='|', thousands=',', decimal='.')
+        tm.assert_frame_equal(df, expected)
+
+        data_with_odd_sep = """A|B|C
+1|2.334,01|5
+10|13|10,
+"""
+        df = self.read_csv(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
+        tm.assert_frame_equal(df, expected)
 
     def test_squeeze(self):
         data = """\
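The ``data_with_odd_sep`` case above exercises the European convention, where ``.`` groups thousands and ``,`` marks the decimal point. As a standalone sketch of what the test checks:

```python
from io import StringIO
import pandas as pd

# European-style formatting of the same numbers as the test above.
data = "A|B\n1|2.334,01\n10|13\n"
df = pd.read_csv(StringIO(data), sep='|', thousands='.', decimal=',')
print(df['B'].tolist())   # [2334.01, 13.0]
```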
@@ -183,7 +242,7 @@ def test_squeeze(self):
         result = self.read_table(StringIO(data), sep=',', index_col=0,
                                  header=None, squeeze=True)
         tm.assert_isinstance(result, Series)
-        assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
     def test_inf_parsing(self):
         data = """\
@@ -201,9 +260,9 @@ def test_inf_parsing(self):
         inf = float('inf')
         expected = Series([inf, -inf] * 5)
         df = read_csv(StringIO(data), index_col=0)
-        assert_almost_equal(df['A'].values, expected.values)
+        tm.assert_almost_equal(df['A'].values, expected.values)
         df = read_csv(StringIO(data), index_col=0, na_filter=False)
-        assert_almost_equal(df['A'].values, expected.values)
+        tm.assert_almost_equal(df['A'].values, expected.values)
 
     def test_multiple_date_col(self):
         # Can use multiple date parsers
@@ -524,7 +583,7 @@ def test_passing_dtype(self):
 
         df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E'])
 
-        with ensure_clean('__passing_str_as_dtype__.csv') as path:
+        with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
             df.to_csv(path)
 
             # GH 3795
@@ -566,7 +625,7 @@ def test_quoting(self):
 
     def test_non_string_na_values(self):
         # GH3611, na_values that are not a string are an issue
-        with ensure_clean('__non_string_na_values__.csv') as path:
+        with tm.ensure_clean('__non_string_na_values__.csv') as path:
             df = DataFrame({'A' : [-999, 2, 3], 'B' : [1.2, -999, 4.5]})
             df.to_csv(path, sep=' ', index=False)
             result1 = read_csv(path, sep= ' ', header=0, na_values=['-999.0','-999'])
@@ -617,15 +676,15 @@ def test_custom_na_values(self):
                     [7, 8, nan]]
 
         df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
-        assert_almost_equal(df.values, expected)
+        tm.assert_almost_equal(df.values, expected)
 
         df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'],
                               skiprows=[1])
-        assert_almost_equal(df2.values, expected)
+        tm.assert_almost_equal(df2.values, expected)
 
         df3 = self.read_table(StringIO(data), sep=',', na_values='baz',
                               skiprows=[1])
-        assert_almost_equal(df3.values, expected)
+        tm.assert_almost_equal(df3.values, expected)
 
     def test_nat_parse(self):
 
@@ -635,7 +694,7 @@ def test_nat_parse(self):
                         'B' : pd.Timestamp('20010101') }))
         df.iloc[3:6,:] = np.nan
 
-        with ensure_clean('__nat_parse_.csv') as path:
+        with tm.ensure_clean('__nat_parse_.csv') as path:
             df.to_csv(path)
             result = read_csv(path,index_col=0,parse_dates=['B'])
             tm.assert_frame_equal(result,df)
@@ -686,7 +745,7 @@ def test_detect_string_na(self):
                     [nan, nan]]
 
         df = self.read_csv(StringIO(data))
-        assert_almost_equal(df.values, expected)
+        tm.assert_almost_equal(df.values, expected)
 
     def test_unnamed_columns(self):
         data = """A,B,C,,
@@ -698,7 +757,7 @@ def test_unnamed_columns(self):
                     [6, 7, 8, 9, 10],
                     [11, 12, 13, 14, 15]]
         df = self.read_table(StringIO(data), sep=',')
-        assert_almost_equal(df.values, expected)
+        tm.assert_almost_equal(df.values, expected)
         self.assert_(np.array_equal(df.columns,
                                     ['A', 'B', 'C', 'Unnamed: 3',
                                      'Unnamed: 4']))
@@ -849,8 +908,8 @@ def test_no_header(self):
         expected = [[1, 2, 3, 4, 5.],
                     [6, 7, 8, 9, 10],
                     [11, 12, 13, 14, 15]]
-        assert_almost_equal(df.values, expected)
-        assert_almost_equal(df.values, df2.values)
+        tm.assert_almost_equal(df.values, expected)
+        tm.assert_almost_equal(df.values, df2.values)
 
         self.assert_(np.array_equal(df_pref.columns,
                                     ['X0', 'X1', 'X2', 'X3', 'X4']))
@@ -1113,7 +1172,7 @@ def test_header_not_first_line(self):
         tm.assert_frame_equal(df, expected)
 
     def test_header_multi_index(self):
-        expected = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
+        expected = tm.makeCustomDataframe(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
 
         data = """\
 C0,,C_l0_g0,C_l0_g1,C_l0_g2
@@ -1413,7 +1472,7 @@ def test_na_value_dict(self):
         tm.assert_frame_equal(df, xp)
 
     @slow
-    @network
+    @tm.network
     def test_url(self):
         try:
             # HTTP(S)
@@ -1428,7 +1487,7 @@ def test_url(self):
 
         except URLError:
             try:
-                with closing(urlopen('http://www.google.com')) as resp:
+                with tm.closing(urlopen('http://www.google.com')) as resp:
                     pass
             except URLError:
                 raise nose.SkipTest
@@ -1533,11 +1592,11 @@ def test_comment(self):
         expected = [[1., 2., 4.],
                     [5., np.nan, 10.]]
         df = self.read_csv(StringIO(data), comment='#')
-        assert_almost_equal(df.values, expected)
+        tm.assert_almost_equal(df.values, expected)
 
         df = self.read_table(StringIO(data), sep=',', comment='#',
                              na_values=['NaN'])
-        assert_almost_equal(df.values, expected)
+        tm.assert_almost_equal(df.values, expected)
 
     def test_bool_na_values(self):
         data = """A,B,C
@@ -1595,7 +1654,7 @@ def test_utf16_bom_skiprows(self):
 
         path = '__%s__.csv' % tm.rands(10)
 
-        with ensure_clean(path) as path:
+        with tm.ensure_clean(path) as path:
             for sep, dat in [('\t', data), (',', data2)]:
                 for enc in ['utf-16', 'utf-16le', 'utf-16be']:
                     bytes = dat.encode(enc)
@@ -1860,7 +1919,25 @@ def test_1000_fwf(self):
                     [10, 13, 10]]
         df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)],
                       thousands=',')
-        assert_almost_equal(df.values, expected)
+        tm.assert_almost_equal(df.values, expected)
+
+    def test_1000_sep_with_decimal(self):
+        data = """A|B|C
+1|2,334.01|5
+10|13|10.
+"""
+
+        expected = DataFrame({
+            'A': [1, 10],
+            'B': [2334.01, 13],
+            'C': [5, 10.]
+        })
+
+        df = self.read_csv(StringIO(data), sep='|', thousands=',')
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_table(StringIO(data), sep='|', thousands=',')
+        tm.assert_frame_equal(df, expected)
 
     def test_comment_fwf(self):
         data = """
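``read_fwf`` shares the same tokenizer path, as ``test_1000_fwf`` above exercises. A small sketch (field widths and values are invented for this example):

```python
from io import StringIO
import pandas as pd

data = ("  1    2,334   5\n"
        " 10       13  10\n")

# Fixed-width fields; the thousands separator is honored per column.
df = pd.read_fwf(StringIO(data), widths=[3, 9, 4], thousands=',', header=None)
print(df[1].tolist())   # [2334, 13]
```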
@@ -1871,7 +1948,7 @@
                     [5, np.nan, 10.]]
         df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)],
                       comment='#')
-        assert_almost_equal(df.values, expected)
+        tm.assert_almost_equal(df.values, expected)
 
     def test_fwf(self):
         data_expected = """\
@@ -1993,7 +2070,7 @@ def test_iteration_open_handle(self):
         if PY3:
             raise nose.SkipTest
 
-        with ensure_clean() as path:
+        with tm.ensure_clean() as path:
             with open(path, 'wb') as f:
                 f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG')
 
@@ -2212,7 +2289,7 @@ def test_decompression(self):
         data = open(self.csv1, 'rb').read()
         expected = self.read_csv(self.csv1)
 
-        with ensure_clean() as path:
+        with tm.ensure_clean() as path:
             tmp = gzip.GzipFile(path, mode='wb')
             tmp.write(data)
             tmp.close()
@@ -2223,7 +2300,7 @@ def test_decompression(self):
             result = self.read_csv(open(path, 'rb'), compression='gzip')
             tm.assert_frame_equal(result, expected)
 
-        with ensure_clean() as path:
+        with tm.ensure_clean() as path:
             tmp = bz2.BZ2File(path, mode='wb')
             tmp.write(data)
             tmp.close()
@@ -2248,7 +2325,7 @@ def test_decompression_regex_sep(self):
         data = data.replace(b',', b'::')
         expected = self.read_csv(self.csv1)
 
-        with ensure_clean() as path:
+        with tm.ensure_clean() as path:
             tmp = gzip.GzipFile(path, mode='wb')
             tmp.write(data)
             tmp.close()
@@ -2256,7 +2333,7 @@ def test_decompression_regex_sep(self):
             result = self.read_csv(path, sep='::', compression='gzip')
             tm.assert_frame_equal(result, expected)
 
-        with ensure_clean() as path:
+        with tm.ensure_clean() as path:
             tmp = bz2.BZ2File(path, mode='wb')
             tmp.write(data)
             tmp.close()
@@ -2470,7 +2547,7 @@ def test_convert_sql_column_decimals(self):
 
 def assert_same_values_and_dtype(res, exp):
     assert(res.dtype == exp.dtype)
-    assert_almost_equal(res, exp)
+    tm.assert_almost_equal(res, exp)
 
 
 if __name__ == '__main__':
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 36055e681a706..8b90e76fa4bf3 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -186,7 +186,7 @@ cdef extern from "parser/tokenizer.h":
     uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
 
     inline int to_double(char *item, double *p_value,
-                         char sci, char decimal)
+                         char sci, char decimal, char thousands)
     inline int to_complex(char *item, double *p_real, double *p_imag,
                           char sci, char decimal)
     inline int to_longlong(char *item, long long *p_value)
@@ -355,7 +355,7 @@ cdef class TextReader:
 
         if thousands is not None:
             if len(thousands) != 1:
-                raise ValueError('Only length-1 decimal markers supported')
+                raise ValueError('Only length-1 thousands markers supported')
             self.parser.thousands = ord(thousands)
 
         if escapechar is not None:
@@ -1397,7 +1397,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
                 na_count += 1
                 data[0] = NA
             else:
-                error = to_double(word, data, parser.sci, parser.decimal)
+                error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
                 if error != 1:
                     if strcasecmp(word, cinf) == 0:
                         data[0] = INF
@@ -1413,7 +1413,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
         else:
             for i in range(lines):
                 word = COLITER_NEXT(it)
-                error = to_double(word, data, parser.sci, parser.decimal)
+                error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
                 if error != 1:
                     if strcasecmp(word, cinf) == 0:
                         data[0] = INF
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index cad5d98dde53a..45b8b9263e9cd 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -1633,7 +1633,7 @@ void test_count_lines(char *fname) {
 
 // forward declaration
-static double xstrtod(const char *p, char **q, char decimal, char sci, int skip_trailing);
+static double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing);
 
 P_INLINE void lowercase(char *p) {
@@ -1661,11 +1661,11 @@ P_INLINE void uppercase(char *p) {
  *
  */
 
-int to_double(char *item, double *p_value, char sci, char decimal)
+int to_double(char *item, double *p_value, char sci, char decimal, char tsep)
 {
     char *p_end;
 
-    *p_value = xstrtod(item, &p_end, decimal, sci, TRUE);
+    *p_value = xstrtod(item, &p_end, decimal, sci, tsep, TRUE);
 
     return (errno == 0) && (!*p_end);
 }
@@ -1675,7 +1675,7 @@ int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, ch
 {
     char *p_end;
 
-    *p_real = xstrtod(item, &p_end, decimal, sci, FALSE);
+    *p_real = xstrtod(item, &p_end, decimal, sci, '\0', FALSE);
     if (*p_end == '\0') {
         *p_imag = 0.0;
         return errno == 0;
@@ -1689,7 +1689,7 @@ int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, ch
     if (*p_end == '+') {
         ++p_end;
     }
-    *p_imag = xstrtod(p_end, &p_end, decimal, sci, FALSE);
+    *p_imag = xstrtod(p_end, &p_end, decimal, sci, '\0', FALSE);
     if (errno || ((*p_end != 'i') && (*p_end != 'j'))) {
         return FALSE;
     }
@@ -1856,10 +1856,12 @@ int main(int argc, char *argv[])
 //     * Added decimal and sci arguments.
 //     * Skip trailing spaces.
 //     * Commented out the other functions.
+// Modifications by Richard T Guy, August 2013:
+//     * Add tsep argument for thousands separator
 //
 
 static double xstrtod(const char *str, char **endptr, char decimal,
-                      char sci, int skip_trailing)
+                      char sci, char tsep, int skip_trailing)
 {
     double number;
     int exponent;
@@ -1894,6 +1896,8 @@ static double xstrtod(const char *str, char **endptr, char decimal,
         number = number * 10. + (*p - '0');
         p++;
         num_digits++;
+
+        p += (tsep != '\0' && *p == tsep);
     }
 
     // Process decimal part
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
index 01f9397685da6..69f627dda554c 100644
--- a/pandas/src/parser/tokenizer.h
+++ b/pandas/src/parser/tokenizer.h
@@ -255,7 +255,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep);
 uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error);
 
-int P_INLINE to_double(char *item, double *p_value, char sci, char decimal);
+int P_INLINE to_double(char *item, double *p_value, char sci, char decimal, char tsep);
 int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal);
 int P_INLINE to_longlong(char *item, long long *p_value);
 int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep);
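To make the ``xstrtod`` change concrete, here is a rough Python model of its integer-part scan with the new ``tsep`` argument (simplified: the real C function also handles sign, the fractional part, exponents, and ``errno``):

```python
def integer_part_with_tsep(s, tsep=','):
    r"""Model of xstrtod's digit loop: after each digit the C code executes
    `p += (tsep != '\0' && *p == tsep);`, stepping over one separator at a time."""
    number = 0.0
    i = 0
    while i < len(s) and s[i].isdigit():
        number = number * 10.0 + (ord(s[i]) - ord('0'))
        i += 1
        # Skip a single thousands separator between digit groups.
        if tsep and i < len(s) and s[i] == tsep:
            i += 1
    return number, i  # parsed value, index where the scan stopped

print(integer_part_with_tsep("2,334.01"))   # (2334.0, 5) -- stops at the '.'
```

Note the model shares the C code's permissiveness: a separator is accepted after any digit, so "1,2,3" scans as 123; grouping is not validated.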