BUG: NaN values not converted to Stata missing values (GH6684) #6685

Merged · 1 commit · Mar 23, 2014
1 change: 1 addition & 0 deletions doc/source/release.rst
@@ -269,6 +269,7 @@ Bug Fixes
- Bug in ``DataFrame.to_stata`` when columns have non-string names (:issue:`4558`)
- Bug in compat with ``np.compress``, surfaced in (:issue:`6658`)
- Bug in binary operations with a rhs of a Series not aligning (:issue:`6681`)
- Bug in ``DataFrame.to_stata`` which incorrectly handles NaN values and ignores the ``write_index`` keyword argument (:issue:`6685`)

pandas 0.13.1
-------------
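
A minimal round-trip sketch of the behavior this note describes, assuming the patched writer; the file name is hypothetical:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, np.nan, 3.0]})
# write_index=False is now forwarded instead of being silently ignored
df.to_stata('nan_roundtrip.dta', write_index=False)  # hypothetical file name
back = pd.read_stata('nan_roundtrip.dta')
# the NaN is written as the generic Stata missing value (.) and read back as NaN
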
3 changes: 2 additions & 1 deletion pandas/core/frame.py
@@ -1258,7 +1258,8 @@ def to_stata(
from pandas.io.stata import StataWriter
writer = StataWriter(fname, self, convert_dates=convert_dates,
encoding=encoding, byteorder=byteorder,
time_stamp=time_stamp, data_label=data_label)
time_stamp=time_stamp, data_label=data_label,
write_index=write_index)
writer.write_file()

@Appender(fmt.docstring_to_string, indents=1)
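
For reference, a direct-use sketch of the writer showing the keyword this hunk now forwards; the file name is hypothetical:

import pandas as pd
from pandas.io.stata import StataWriter

df = pd.DataFrame({'x': [1.0, 2.0]})
writer = StataWriter('no_index.dta', df, write_index=False)  # hypothetical path
writer.write_file()
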
23 changes: 17 additions & 6 deletions pandas/io/stata.py
@@ -990,8 +990,6 @@ def _dtype_to_stata_type(dtype):
return chr(255)
elif dtype == np.float32:
return chr(254)
elif dtype == np.int64:
return chr(253)
elif dtype == np.int32:
return chr(253)
elif dtype == np.int16:
@@ -1025,8 +1023,6 @@ def _dtype_to_default_stata_fmt(dtype):
return "%10.0g"
elif dtype == np.float32:
return "%9.0g"
elif dtype == np.int64:
return "%9.0g"
elif dtype == np.int32:
return "%12.0g"
elif dtype == np.int8 or dtype == np.int16:
@@ -1108,6 +1104,21 @@ def _write(self, to_write):
self._file.write(to_write)


def _replace_nans(self, data):
"""Checks floating point data columns for NaNs and replaces them with
the generic Stata missing value (.)"""
for c in data:
dtype = data[c].dtype
if dtype in (np.float32, np.float64):
if dtype == np.float32:
replacement = self.MISSING_VALUES['f']
else:
replacement = self.MISSING_VALUES['d']
data[c] = data[c].fillna(replacement)

return data

def _check_column_names(self, data):
"""Checks column names to ensure that they are valid Stata column names.
This includes checks for:
@@ -1197,6 +1208,8 @@ def __iter__(self):
data = _cast_to_stata_types(data)
# Ensure column names are strings
data = self._check_column_names(data)
# Replace NaNs with Stata missing values
data = self._replace_nans(data)
self.datarows = DataFrameRowIter(data)
self.nobs, self.nvar = data.shape
self.data = data
@@ -1340,8 +1353,6 @@ def _write_data_dates(self):
var = _pad_bytes(var, typ)
self._write(var)
else:
if isnull(var): # this only matters for floats
var = MISSING_VALUES[TYPE_MAP[typ]]
self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var))

def _null_terminate(self, s, as_string=False):
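
A standalone sketch of the idea behind _replace_nans above; the function and parameter names are illustrative, and the real writer takes the sentinel values from its MISSING_VALUES map rather than as arguments:

import numpy as np

def replace_nans_for_stata(data, missing_float, missing_double):
    # swap NaN in float columns for the sentinel value Stata reads as '.'
    for c in data:
        dtype = data[c].dtype
        if dtype == np.float32:
            data[c] = data[c].fillna(missing_float)
        elif dtype == np.float64:
            data[c] = data[c].fillna(missing_double)
    return data
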
124 changes: 109 additions & 15 deletions pandas/io/tests/test_stata.py
@@ -13,7 +13,8 @@
import pandas as pd
from pandas.core.frame import DataFrame, Series
from pandas.io.parsers import read_csv
from pandas.io.stata import read_stata, StataReader, InvalidColumnName
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
PossiblePrecisionLoss)
import pandas.util.testing as tm
from pandas.util.misc import is_little_endian
from pandas import compat
@@ -142,8 +143,7 @@ def test_read_dta2(self):
parsed_117 = self.read_dta(self.dta2_117)
# 113 is buggy due to limited date format support in Stata
# parsed_113 = self.read_dta(self.dta2_113)

np.testing.assert_equal(
tm.assert_equal(
len(w), 1) # should get a warning for that format.

# buggy test because of the NaT comparison on certain platforms
@@ -206,7 +206,7 @@ def test_read_write_dta5(self):
original.index.name = 'index'

with tm.ensure_clean() as path:
original.to_stata(path, None, False)
original.to_stata(path, None)
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
original)
@@ -221,7 +221,7 @@ def test_write_dta6(self):
original['quarter'] = original['quarter'].astype(np.int32)

with tm.ensure_clean() as path:
original.to_stata(path, None, False)
original.to_stata(path, None)
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
original)
@@ -257,7 +257,7 @@ def test_read_write_dta10(self):
original['integer'] = original['integer'].astype(np.int32)

with tm.ensure_clean() as path:
original.to_stata(path, {'datetime': 'tc'}, False)
original.to_stata(path, {'datetime': 'tc'})
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
original)
@@ -295,9 +295,9 @@ def test_read_write_dta11(self):

with tm.ensure_clean() as path:
with warnings.catch_warnings(record=True) as w:
original.to_stata(path, None, False)
np.testing.assert_equal(
len(w), 1) # should get a warning for that format.
original.to_stata(path, None)
# should get a warning for that format.
tm.assert_equal(len(w), 1)

written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
@@ -324,13 +324,12 @@ def test_read_write_dta12(self):

with tm.ensure_clean() as path:
with warnings.catch_warnings(record=True) as w:
original.to_stata(path, None, False)
np.testing.assert_equal(
len(w), 1) # should get a warning for that format.
original.to_stata(path, None)
tm.assert_equal(len(w), 1) # should get a warning for that format.

written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)

def test_read_write_dta13(self):
s1 = Series(2**9, dtype=np.int16)
s2 = Series(2**17, dtype=np.int32)
@@ -366,7 +365,7 @@ def test_read_write_reread_dta14(self):
tm.assert_frame_equal(parsed_114, parsed_115)

with tm.ensure_clean() as path:
parsed_114.to_stata(path, {'date_td': 'td'}, write_index=False)
parsed_114.to_stata(path, {'date_td': 'td'})
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114)

@@ -406,7 +405,7 @@ def test_numeric_column_names(self):
with warnings.catch_warnings(record=True) as w:
tm.assert_produces_warning(original.to_stata(path), InvalidColumnName)
# should produce a single warning
np.testing.assert_equal(len(w), 1)
tm.assert_equal(len(w), 1)

written_and_read_again = self.read_dta(path)
written_and_read_again = written_and_read_again.set_index('index')
@@ -415,7 +414,102 @@
written_and_read_again.columns = map(convert_col_name, columns)
tm.assert_frame_equal(original, written_and_read_again)

def test_nan_to_missing_value(self):
s1 = Series(np.arange(4.0), dtype=np.float32)
s2 = Series(np.arange(4.0), dtype=np.float64)
s1[::2] = np.nan
s2[1::2] = np.nan
original = DataFrame({'s1': s1, 's2': s2})
original.index.name = 'index'
with tm.ensure_clean() as path:
original.to_stata(path)
written_and_read_again = self.read_dta(path)
written_and_read_again = written_and_read_again.set_index('index')
tm.assert_frame_equal(written_and_read_again, original)

def test_no_index(self):
columns = ['x', 'y']
original = DataFrame(np.reshape(np.arange(10.0), (5, 2)),
columns=columns)
original.index.name = 'index_not_written'
with tm.ensure_clean() as path:
original.to_stata(path, write_index=False)
written_and_read_again = self.read_dta(path)
tm.assertRaises(KeyError,
lambda: written_and_read_again['index_not_written'])

def test_string_no_dates(self):
s1 = Series(['a', 'A longer string'])
s2 = Series([1.0, 2.0], dtype=np.float64)
original = DataFrame({'s1': s1, 's2': s2})
original.index.name = 'index'
with tm.ensure_clean() as path:
original.to_stata(path)
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
original)

def test_large_value_conversion(self):
s0 = Series([1, 99], dtype=np.int8)
s1 = Series([1, 127], dtype=np.int8)
s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
original.index.name = 'index'
with tm.ensure_clean() as path:
with warnings.catch_warnings(record=True) as w:
tm.assert_produces_warning(original.to_stata(path),
PossiblePrecisionLoss)
# should produce a single warning
tm.assert_equal(len(w), 1)

written_and_read_again = self.read_dta(path)
modified = original.copy()
modified['s1'] = Series(modified['s1'], dtype=np.int16)
modified['s2'] = Series(modified['s2'], dtype=np.int32)
modified['s3'] = Series(modified['s3'], dtype=np.float64)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
modified)

def test_dates_invalid_column(self):
original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
original.index.name = 'index'
with tm.ensure_clean() as path:
with warnings.catch_warnings(record=True) as w:
tm.assert_produces_warning(original.to_stata(path, {0: 'tc'}),
InvalidColumnName)
tm.assert_equal(len(w), 1)

written_and_read_again = self.read_dta(path)
modified = original.copy()
modified.columns = ['_0']
tm.assert_frame_equal(written_and_read_again.set_index('index'),
modified)

def test_date_export_formats(self):
columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty']
conversions = dict(((c, c) for c in columns))
data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
original = DataFrame([data], columns=columns)
original.index.name = 'index'
expected_values = [datetime(2006, 11, 20, 23, 13, 20), # Time
datetime(2006, 11, 20), # Day
datetime(2006, 11, 19), # Week
datetime(2006, 11, 1), # Month
datetime(2006, 10, 1), # Quarter year
datetime(2006, 7, 1), # Half year
datetime(2006, 1, 1)] # Year

expected = DataFrame([expected_values], columns=columns)
expected.index.name = 'index'
with tm.ensure_clean() as path:
original.to_stata(path, conversions)
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
expected)


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)