Skip to content

Commit 88c4c55

Browse files
committed
BUG: NaN values not converted to Stata missing values
Stata does not correctly handle NaNs, and so these must be replaced with Stata missing values (. by default). The fix checks floating point columns for NaN and replaces these values with the Stata numeric code for (.). Previously only one of the file-writing code paths handled this case; that last-minute per-value check has been removed in favour of a single up-front replacement. The write_index option was also being silently ignored; this has been fixed, and numerous tests which were not correct have been repaired. Also contains additional tests covering edge cases uncovered while developing the fix.
1 parent 66cf19a commit 88c4c55

File tree

4 files changed

+129
-22
lines changed

4 files changed

+129
-22
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@ Bug Fixes
269269
- Bug in ``DataFrame.to_stata`` when columns have non-string names (:issue:`4558`)
270270
- Bug in compat with ``np.compress``, surfaced in (:issue:`6658`)
271271
- Bug in binary operations with a rhs of a Series not aligning (:issue:`6681`)
272+
- Bug in ``DataFrame.to_stata`` which incorrectly handles nan values and ignores ``write_index`` keyword argument (:issue:`6685`)
272273

273274
pandas 0.13.1
274275
-------------

pandas/core/frame.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1258,7 +1258,8 @@ def to_stata(
12581258
from pandas.io.stata import StataWriter
12591259
writer = StataWriter(fname, self, convert_dates=convert_dates,
12601260
encoding=encoding, byteorder=byteorder,
1261-
time_stamp=time_stamp, data_label=data_label)
1261+
time_stamp=time_stamp, data_label=data_label,
1262+
write_index=write_index)
12621263
writer.write_file()
12631264

12641265
@Appender(fmt.docstring_to_string, indents=1)

pandas/io/stata.py

+17-6
Original file line numberDiff line numberDiff line change
@@ -990,8 +990,6 @@ def _dtype_to_stata_type(dtype):
990990
return chr(255)
991991
elif dtype == np.float32:
992992
return chr(254)
993-
elif dtype == np.int64:
994-
return chr(253)
995993
elif dtype == np.int32:
996994
return chr(253)
997995
elif dtype == np.int16:
@@ -1025,8 +1023,6 @@ def _dtype_to_default_stata_fmt(dtype):
10251023
return "%10.0g"
10261024
elif dtype == np.float32:
10271025
return "%9.0g"
1028-
elif dtype == np.int64:
1029-
return "%9.0g"
10301026
elif dtype == np.int32:
10311027
return "%12.0g"
10321028
elif dtype == np.int8 or dtype == np.int16:
@@ -1108,6 +1104,21 @@ def _write(self, to_write):
11081104
self._file.write(to_write)
11091105

11101106

1107+
def _replace_nans(self, data):
1108+
# return data
1109+
"""Checks floating point data columns for nans, and replaces these with
1110+
the generic Stata missing value (.)"""
1111+
for c in data:
1112+
dtype = data[c].dtype
1113+
if dtype in (np.float32, np.float64):
1114+
if dtype == np.float32:
1115+
replacement = self.MISSING_VALUES['f']
1116+
else:
1117+
replacement = self.MISSING_VALUES['d']
1118+
data[c] = data[c].fillna(replacement)
1119+
1120+
return data
1121+
11111122
def _check_column_names(self, data):
11121123
"""Checks column names to ensure that they are valid Stata column names.
11131124
This includes checks for:
@@ -1197,6 +1208,8 @@ def __iter__(self):
11971208
data = _cast_to_stata_types(data)
11981209
# Ensure column names are strings
11991210
data = self._check_column_names(data)
1211+
# Replace NaNs with Stata missing values
1212+
data = self._replace_nans(data)
12001213
self.datarows = DataFrameRowIter(data)
12011214
self.nobs, self.nvar = data.shape
12021215
self.data = data
@@ -1340,8 +1353,6 @@ def _write_data_dates(self):
13401353
var = _pad_bytes(var, typ)
13411354
self._write(var)
13421355
else:
1343-
if isnull(var): # this only matters for floats
1344-
var = MISSING_VALUES[TYPE_MAP[typ]]
13451356
self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var))
13461357

13471358
def _null_terminate(self, s, as_string=False):

pandas/io/tests/test_stata.py

+109-15
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
import pandas as pd
1414
from pandas.core.frame import DataFrame, Series
1515
from pandas.io.parsers import read_csv
16-
from pandas.io.stata import read_stata, StataReader, InvalidColumnName
16+
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
17+
PossiblePrecisionLoss)
1718
import pandas.util.testing as tm
1819
from pandas.util.misc import is_little_endian
1920
from pandas import compat
@@ -142,8 +143,7 @@ def test_read_dta2(self):
142143
parsed_117 = self.read_dta(self.dta2_117)
143144
# 113 is buggy due to limited date format support in Stata
144145
# parsed_113 = self.read_dta(self.dta2_113)
145-
146-
np.testing.assert_equal(
146+
tm.assert_equal(
147147
len(w), 1) # should get a warning for that format.
148148

149149
# buggy test because of the NaT comparison on certain platforms
@@ -206,7 +206,7 @@ def test_read_write_dta5(self):
206206
original.index.name = 'index'
207207

208208
with tm.ensure_clean() as path:
209-
original.to_stata(path, None, False)
209+
original.to_stata(path, None)
210210
written_and_read_again = self.read_dta(path)
211211
tm.assert_frame_equal(written_and_read_again.set_index('index'),
212212
original)
@@ -221,7 +221,7 @@ def test_write_dta6(self):
221221
original['quarter'] = original['quarter'].astype(np.int32)
222222

223223
with tm.ensure_clean() as path:
224-
original.to_stata(path, None, False)
224+
original.to_stata(path, None)
225225
written_and_read_again = self.read_dta(path)
226226
tm.assert_frame_equal(written_and_read_again.set_index('index'),
227227
original)
@@ -257,7 +257,7 @@ def test_read_write_dta10(self):
257257
original['integer'] = original['integer'].astype(np.int32)
258258

259259
with tm.ensure_clean() as path:
260-
original.to_stata(path, {'datetime': 'tc'}, False)
260+
original.to_stata(path, {'datetime': 'tc'})
261261
written_and_read_again = self.read_dta(path)
262262
tm.assert_frame_equal(written_and_read_again.set_index('index'),
263263
original)
@@ -295,9 +295,9 @@ def test_read_write_dta11(self):
295295

296296
with tm.ensure_clean() as path:
297297
with warnings.catch_warnings(record=True) as w:
298-
original.to_stata(path, None, False)
299-
np.testing.assert_equal(
300-
len(w), 1) # should get a warning for that format.
298+
original.to_stata(path, None)
299+
# should get a warning for that format.
300+
tm.assert_equal(len(w), 1)
301301

302302
written_and_read_again = self.read_dta(path)
303303
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
@@ -324,13 +324,12 @@ def test_read_write_dta12(self):
324324

325325
with tm.ensure_clean() as path:
326326
with warnings.catch_warnings(record=True) as w:
327-
original.to_stata(path, None, False)
328-
np.testing.assert_equal(
329-
len(w), 1) # should get a warning for that format.
327+
original.to_stata(path, None)
328+
tm.assert_equal(len(w), 1) # should get a warning for that format.
330329

331330
written_and_read_again = self.read_dta(path)
332331
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
333-
332+
334333
def test_read_write_dta13(self):
335334
s1 = Series(2**9, dtype=np.int16)
336335
s2 = Series(2**17, dtype=np.int32)
@@ -366,7 +365,7 @@ def test_read_write_reread_dta14(self):
366365
tm.assert_frame_equal(parsed_114, parsed_115)
367366

368367
with tm.ensure_clean() as path:
369-
parsed_114.to_stata(path, {'date_td': 'td'}, write_index=False)
368+
parsed_114.to_stata(path, {'date_td': 'td'})
370369
written_and_read_again = self.read_dta(path)
371370
tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114)
372371

@@ -406,7 +405,7 @@ def test_numeric_column_names(self):
406405
with warnings.catch_warnings(record=True) as w:
407406
tm.assert_produces_warning(original.to_stata(path), InvalidColumnName)
408407
# should produce a single warning
409-
np.testing.assert_equal(len(w), 1)
408+
tm.assert_equal(len(w), 1)
410409

411410
written_and_read_again = self.read_dta(path)
412411
written_and_read_again = written_and_read_again.set_index('index')
@@ -415,7 +414,102 @@ def test_numeric_column_names(self):
415414
written_and_read_again.columns = map(convert_col_name, columns)
416415
tm.assert_frame_equal(original, written_and_read_again)
417416

417+
def test_nan_to_missing_value(self):
418+
s1 = Series(np.arange(4.0), dtype=np.float32)
419+
s2 = Series(np.arange(4.0), dtype=np.float64)
420+
s1[::2] = np.nan
421+
s2[1::2] = np.nan
422+
original = DataFrame({'s1': s1, 's2': s2})
423+
original.index.name = 'index'
424+
with tm.ensure_clean() as path:
425+
original.to_stata(path)
426+
written_and_read_again = self.read_dta(path)
427+
written_and_read_again = written_and_read_again.set_index('index')
428+
tm.assert_frame_equal(written_and_read_again, original)
429+
430+
def test_no_index(self):
431+
columns = ['x', 'y']
432+
original = DataFrame(np.reshape(np.arange(10.0), (5, 2)),
433+
columns=columns)
434+
original.index.name = 'index_not_written'
435+
with tm.ensure_clean() as path:
436+
original.to_stata(path, write_index=False)
437+
written_and_read_again = self.read_dta(path)
438+
tm.assertRaises(KeyError,
439+
lambda: written_and_read_again['index_not_written'])
440+
441+
def test_string_no_dates(self):
442+
s1 = Series(['a', 'A longer string'])
443+
s2 = Series([1.0, 2.0], dtype=np.float64)
444+
original = DataFrame({'s1': s1, 's2': s2})
445+
original.index.name = 'index'
446+
with tm.ensure_clean() as path:
447+
original.to_stata(path)
448+
written_and_read_again = self.read_dta(path)
449+
tm.assert_frame_equal(written_and_read_again.set_index('index'),
450+
original)
451+
452+
def test_large_value_conversion(self):
453+
s0 = Series([1, 99], dtype=np.int8)
454+
s1 = Series([1, 127], dtype=np.int8)
455+
s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
456+
s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
457+
original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
458+
original.index.name = 'index'
459+
with tm.ensure_clean() as path:
460+
with warnings.catch_warnings(record=True) as w:
461+
tm.assert_produces_warning(original.to_stata(path),
462+
PossiblePrecisionLoss)
463+
# should produce a single warning
464+
tm.assert_equal(len(w), 1)
465+
466+
written_and_read_again = self.read_dta(path)
467+
modified = original.copy()
468+
modified['s1'] = Series(modified['s1'], dtype=np.int16)
469+
modified['s2'] = Series(modified['s2'], dtype=np.int32)
470+
modified['s3'] = Series(modified['s3'], dtype=np.float64)
471+
tm.assert_frame_equal(written_and_read_again.set_index('index'),
472+
modified)
473+
474+
def test_dates_invalid_column(self):
475+
original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
476+
original.index.name = 'index'
477+
with tm.ensure_clean() as path:
478+
with warnings.catch_warnings(record=True) as w:
479+
tm.assert_produces_warning(original.to_stata(path, {0: 'tc'}),
480+
InvalidColumnName)
481+
tm.assert_equal(len(w), 1)
482+
483+
written_and_read_again = self.read_dta(path)
484+
modified = original.copy()
485+
modified.columns = ['_0']
486+
tm.assert_frame_equal(written_and_read_again.set_index('index'),
487+
modified)
488+
489+
def test_date_export_formats(self):
490+
columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty']
491+
conversions = dict(((c, c) for c in columns))
492+
data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
493+
original = DataFrame([data], columns=columns)
494+
original.index.name = 'index'
495+
expected_values = [datetime(2006, 11, 20, 23, 13, 20), # Time
496+
datetime(2006, 11, 20), # Day
497+
datetime(2006, 11, 19), # Week
498+
datetime(2006, 11, 1), # Month
499+
datetime(2006, 10, 1), # Quarter year
500+
datetime(2006, 7, 1), # Half year
501+
datetime(2006, 1, 1)] # Year
502+
503+
expected = DataFrame([expected_values], columns=columns)
504+
expected.index.name = 'index'
505+
with tm.ensure_clean() as path:
506+
original.to_stata(path, conversions)
507+
written_and_read_again = self.read_dta(path)
508+
tm.assert_frame_equal(written_and_read_again.set_index('index'),
509+
expected)
510+
418511

419512
if __name__ == '__main__':
420513
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
421514
exit=False)
515+

0 commit comments

Comments
 (0)