BUG: Conflict between thousands sep and date parser. #4945


Merged 2 commits on Sep 26, 2013
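For context, a minimal reproduction of the reported conflict (a hypothetical standalone script, not part of this PR, reusing the data from the test added below). Before this change, the thousands separator was stripped from every numeric-looking field before date parsing, so with thousands='-' a date such as '06-02-2013' was mangled into '06022013'; after this change, columns consumed by parse_dates keep their raw text.

from io import StringIO
import pandas as pd

# Day/month/year date, a time column, and a number that uses '-' as
# the thousands separator (the combination reported in GH4678).
data = '06-02-2013;13:00;1-000.215'

df = pd.read_csv(StringIO(data), sep=';', thousands='-',
                 parse_dates={'Date': [0, 1]}, header=None)

# Expected output (matching the new test): columns 0 and 1 are combined
# and parsed as a datetime, and only column 2 loses its '-' separator.
#                  Date         2
# 0 2013-06-02 13:00:00  1000.215
print(df)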
1 change: 1 addition & 0 deletions doc/source/release.rst
@@ -447,6 +447,7 @@ Bug Fixes
- Fixed a bug in ``convert_objects`` for > 2 ndims (:issue:`4937`)
- Fixed a bug in DataFrame/Panel cache insertion and subsequent indexing (:issue:`4939`)
- Fixed string methods for ``FrozenNDArray`` and ``FrozenList`` (:issue:`4929`)
- Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`)

pandas 0.12.0
-------------
2 changes: 1 addition & 1 deletion pandas/io/date_converters.py
@@ -26,7 +26,7 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col,
minute_col = _maybe_cast(minute_col)
second_col = _maybe_cast(second_col)
return lib.try_parse_datetime_components(year_col, month_col, day_col,
hour_col, minute_col, second_col)
hour_col, minute_col, second_col)


def generic_parser(parse_func, *cols):
46 changes: 43 additions & 3 deletions pandas/io/parsers.py
@@ -1020,6 +1020,14 @@ def _set(x):
else:
_set(val)

elif isinstance(self.parse_dates, dict):
for val in self.parse_dates.values():
if isinstance(val, list):
for k in val:
_set(k)
else:
_set(val)

def set_error_bad_lines(self, status):
self._reader.set_error_bad_lines(int(status))

@@ -1269,6 +1277,7 @@ def __init__(self, f, **kwds):
self._make_reader(f)
else:
self.data = f

self.columns = self._infer_columns()

# we are processing a multi index column
@@ -1292,6 +1301,38 @@ def __init__(self, f, **kwds):
self.index_names = index_names
self._first_chunk = True

if self.parse_dates:
self._no_thousands_columns = self._set_no_thousands_columns()
else:
self._no_thousands_columns = None

def _set_no_thousands_columns(self):
# Create a set of column ids that are not to be stripped of thousands operators.
noconvert_columns = set()

def _set(x):
if com.is_integer(x):
noconvert_columns.add(x)
else:
noconvert_columns.add(self.columns.index(x))

if isinstance(self.parse_dates, list):
for val in self.parse_dates:
if isinstance(val, list):
for k in val:
_set(k)
else:
_set(val)

elif isinstance(self.parse_dates, dict):
for val in self.parse_dates.values():
if isinstance(val, list):
for k in val:
_set(k)
else:
_set(val)
return noconvert_columns
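
# Editor's illustration (not part of this diff): a standalone sketch of what
# the helper above computes, assuming the parse_dates forms handled here
# (a list of columns / column lists, or a dict mapping new names to them).
def no_thousands_columns(parse_dates, columns):
    # Columns that feed the date parser must keep their raw text, so the
    # thousands separator is never stripped from them.
    noconvert = set()

    def _set(x):
        # Accept either a positional index or a column label.
        noconvert.add(x if isinstance(x, int) else columns.index(x))

    if isinstance(parse_dates, dict):
        values = parse_dates.values()
    elif isinstance(parse_dates, list):
        values = parse_dates
    else:
        values = []
    for val in values:
        if isinstance(val, list):
            for k in val:
                _set(k)
        else:
            _set(val)
    return noconvert

# With the dict used in the new test, no_thousands_columns({'Date': [0, 1]},
# columns=[0, 1, 2]) returns {0, 1}; _check_thousands below then skips those
# two columns and only cleans column 2.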

def _make_reader(self, f):
sep = self.delimiter

@@ -1500,7 +1541,6 @@ def _next_line(self):
line = next(self.data)

line = self._check_comments([line])[0]
line = self._check_thousands([line])[0]

self.pos += 1
self.buf.append(line)
@@ -1532,9 +1572,10 @@ def _check_thousands(self, lines):
ret = []
for l in lines:
rl = []
for x in l:
for i, x in enumerate(l):
if (not isinstance(x, compat.string_types) or
self.thousands not in x or
(self._no_thousands_columns and i in self._no_thousands_columns) or
nonnum.search(x.strip())):
rl.append(x)
else:
@@ -1608,7 +1649,6 @@ def _rows_to_cols(self, content):
raise AssertionError()

if col_len != zip_len and self.index_col is not False:
row_num = -1
i = 0
for (i, l) in enumerate(content):
if len(l) != col_len:
12 changes: 12 additions & 0 deletions pandas/io/tests/test_parsers.py
@@ -233,6 +233,18 @@ def test_1000_sep_with_decimal(self):
df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
tm.assert_frame_equal(df, expected)

def test_separator_date_conflict(self):
Contributor: does this run for both PythonParser and the C parser? (A manual check against both engines is sketched after this test.)

# Regression test for issue #4678: make sure thousands separator and
# date parsing do not conflict.
data = '06-02-2013;13:00;1-000.215'
expected = DataFrame(
[[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
columns=['Date', 2]
)

df = self.read_csv(StringIO(data), sep=';', thousands='-', parse_dates={'Date': [0, 1]}, header=None)
tm.assert_frame_equal(df, expected)
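
One hypothetical way to check the question above by hand, assuming the engine keyword that read_csv exposes in later pandas releases (not part of this PR):

from io import StringIO
from datetime import datetime
import pandas as pd
from pandas.testing import assert_frame_equal

data = '06-02-2013;13:00;1-000.215'
expected = pd.DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
                        columns=['Date', 2])

# Run the same read through both the C parser and the pure-Python parser.
for engine in ('c', 'python'):
    result = pd.read_csv(StringIO(data), sep=';', thousands='-',
                         parse_dates={'Date': [0, 1]}, header=None,
                         engine=engine)
    assert_frame_equal(result, expected)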

def test_squeeze(self):
data = """\
a,1
10 changes: 6 additions & 4 deletions pandas/src/inference.pyx
@@ -708,20 +708,22 @@ def try_parse_datetime_components(ndarray[object] years,
Py_ssize_t i, n
ndarray[object] result
int secs
double float_secs
double micros

from datetime import datetime

n = len(years)
if (len(months) != n and len(days) != n and len(hours) != n and
len(minutes) != n and len(seconds) != n):
if (len(months) != n or len(days) != n or len(hours) != n or
len(minutes) != n or len(seconds) != n):
raise ValueError('Length of all datetime components must be equal')
result = np.empty(n, dtype='O')

for i from 0 <= i < n:
secs = int(seconds[i])
float_secs = float(seconds[i])
secs = int(float_secs)

micros = seconds[i] - secs
micros = float_secs - secs
if micros > 0:
micros = micros * 1000000
