Skip to content

Commit 83b1ce4

Browse files
committed
Merge pull request pandas-dev#6685 from bashtage/stata-world-indicators
BUG: NaN values not converted to Stata missing values (GH6684)
2 parents 66cf19a + 88c4c55 commit 83b1ce4

File tree

4 files changed

+129
-22
lines changed

4 files changed

+129
-22
lines changed

doc/source/release.rst

+1
Original file line number | Diff line number | Diff line change
@@ -269,6 +269,7 @@ Bug Fixes
269269
- Bug in ``DataFrame.to_stata`` when columns have non-string names (:issue:`4558`)
270270
- Bug in compat with ``np.compress``, surfaced in (:issue:`6658`)
271271
- Bug in binary operations with a rhs of a Series not aligning (:issue:`6681`)
272+
- Bug in ``DataFrame.to_stata`` which incorrectly handles NaN values and ignores the ``write_index`` keyword argument (:issue:`6685`)
272273

273274
pandas 0.13.1
274275
-------------

pandas/core/frame.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1258,7 +1258,8 @@ def to_stata(
12581258
from pandas.io.stata import StataWriter
12591259
writer = StataWriter(fname, self, convert_dates=convert_dates,
12601260
encoding=encoding, byteorder=byteorder,
1261-
time_stamp=time_stamp, data_label=data_label)
1261+
time_stamp=time_stamp, data_label=data_label,
1262+
write_index=write_index)
12621263
writer.write_file()
12631264

12641265
@Appender(fmt.docstring_to_string, indents=1)

pandas/io/stata.py

+17-6
Original file line number | Diff line number | Diff line change
@@ -990,8 +990,6 @@ def _dtype_to_stata_type(dtype):
990990
return chr(255)
991991
elif dtype == np.float32:
992992
return chr(254)
993-
elif dtype == np.int64:
994-
return chr(253)
995993
elif dtype == np.int32:
996994
return chr(253)
997995
elif dtype == np.int16:
@@ -1025,8 +1023,6 @@ def _dtype_to_default_stata_fmt(dtype):
10251023
return "%10.0g"
10261024
elif dtype == np.float32:
10271025
return "%9.0g"
1028-
elif dtype == np.int64:
1029-
return "%9.0g"
10301026
elif dtype == np.int32:
10311027
return "%12.0g"
10321028
elif dtype == np.int8 or dtype == np.int16:
@@ -1108,6 +1104,21 @@ def _write(self, to_write):
11081104
self._file.write(to_write)
11091105

11101106

1107+
def _replace_nans(self, data):
1108+
# return data
1109+
"""Checks floating point data columns for NaNs, and replaces these with
1110+
the generic Stata missing value (.)"""
1111+
for c in data:
1112+
dtype = data[c].dtype
1113+
if dtype in (np.float32, np.float64):
1114+
if dtype == np.float32:
1115+
replacement = self.MISSING_VALUES['f']
1116+
else:
1117+
replacement = self.MISSING_VALUES['d']
1118+
data[c] = data[c].fillna(replacement)
1119+
1120+
return data
1121+
11111122
def _check_column_names(self, data):
11121123
"""Checks column names to ensure that they are valid Stata column names.
11131124
This includes checks for:
@@ -1197,6 +1208,8 @@ def __iter__(self):
11971208
data = _cast_to_stata_types(data)
11981209
# Ensure column names are strings
11991210
data = self._check_column_names(data)
1211+
# Replace NaNs with Stata missing values
1212+
data = self._replace_nans(data)
12001213
self.datarows = DataFrameRowIter(data)
12011214
self.nobs, self.nvar = data.shape
12021215
self.data = data
@@ -1340,8 +1353,6 @@ def _write_data_dates(self):
13401353
var = _pad_bytes(var, typ)
13411354
self._write(var)
13421355
else:
1343-
if isnull(var): # this only matters for floats
1344-
var = MISSING_VALUES[TYPE_MAP[typ]]
13451356
self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var))
13461357

13471358
def _null_terminate(self, s, as_string=False):

pandas/io/tests/test_stata.py

+109-15
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,8 @@
1313
import pandas as pd
1414
from pandas.core.frame import DataFrame, Series
1515
from pandas.io.parsers import read_csv
16-
from pandas.io.stata import read_stata, StataReader, InvalidColumnName
16+
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
17+
PossiblePrecisionLoss)
1718
import pandas.util.testing as tm
1819
from pandas.util.misc import is_little_endian
1920
from pandas import compat
@@ -142,8 +143,7 @@ def test_read_dta2(self):
142143
parsed_117 = self.read_dta(self.dta2_117)
143144
# 113 is buggy due to limited date format support in Stata
144145
# parsed_113 = self.read_dta(self.dta2_113)
145-
146-
np.testing.assert_equal(
146+
tm.assert_equal(
147147
len(w), 1) # should get a warning for that format.
148148

149149
# buggy test because of the NaT comparison on certain platforms
@@ -206,7 +206,7 @@ def test_read_write_dta5(self):
206206
original.index.name = 'index'
207207

208208
with tm.ensure_clean() as path:
209-
original.to_stata(path, None, False)
209+
original.to_stata(path, None)
210210
written_and_read_again = self.read_dta(path)
211211
tm.assert_frame_equal(written_and_read_again.set_index('index'),
212212
original)
@@ -221,7 +221,7 @@ def test_write_dta6(self):
221221
original['quarter'] = original['quarter'].astype(np.int32)
222222

223223
with tm.ensure_clean() as path:
224-
original.to_stata(path, None, False)
224+
original.to_stata(path, None)
225225
written_and_read_again = self.read_dta(path)
226226
tm.assert_frame_equal(written_and_read_again.set_index('index'),
227227
original)
@@ -257,7 +257,7 @@ def test_read_write_dta10(self):
257257
original['integer'] = original['integer'].astype(np.int32)
258258

259259
with tm.ensure_clean() as path:
260-
original.to_stata(path, {'datetime': 'tc'}, False)
260+
original.to_stata(path, {'datetime': 'tc'})
261261
written_and_read_again = self.read_dta(path)
262262
tm.assert_frame_equal(written_and_read_again.set_index('index'),
263263
original)
@@ -295,9 +295,9 @@ def test_read_write_dta11(self):
295295

296296
with tm.ensure_clean() as path:
297297
with warnings.catch_warnings(record=True) as w:
298-
original.to_stata(path, None, False)
299-
np.testing.assert_equal(
300-
len(w), 1) # should get a warning for that format.
298+
original.to_stata(path, None)
299+
# should get a warning for that format.
300+
tm.assert_equal(len(w), 1)
301301

302302
written_and_read_again = self.read_dta(path)
303303
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
@@ -324,13 +324,12 @@ def test_read_write_dta12(self):
324324

325325
with tm.ensure_clean() as path:
326326
with warnings.catch_warnings(record=True) as w:
327-
original.to_stata(path, None, False)
328-
np.testing.assert_equal(
329-
len(w), 1) # should get a warning for that format.
327+
original.to_stata(path, None)
328+
tm.assert_equal(len(w), 1) # should get a warning for that format.
330329

331330
written_and_read_again = self.read_dta(path)
332331
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
333-
332+
334333
def test_read_write_dta13(self):
335334
s1 = Series(2**9, dtype=np.int16)
336335
s2 = Series(2**17, dtype=np.int32)
@@ -366,7 +365,7 @@ def test_read_write_reread_dta14(self):
366365
tm.assert_frame_equal(parsed_114, parsed_115)
367366

368367
with tm.ensure_clean() as path:
369-
parsed_114.to_stata(path, {'date_td': 'td'}, write_index=False)
368+
parsed_114.to_stata(path, {'date_td': 'td'})
370369
written_and_read_again = self.read_dta(path)
371370
tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114)
372371

@@ -406,7 +405,7 @@ def test_numeric_column_names(self):
406405
with warnings.catch_warnings(record=True) as w:
407406
tm.assert_produces_warning(original.to_stata(path), InvalidColumnName)
408407
# should produce a single warning
409-
np.testing.assert_equal(len(w), 1)
408+
tm.assert_equal(len(w), 1)
410409

411410
written_and_read_again = self.read_dta(path)
412411
written_and_read_again = written_and_read_again.set_index('index')
@@ -415,7 +414,102 @@ def test_numeric_column_names(self):
415414
written_and_read_again.columns = map(convert_col_name, columns)
416415
tm.assert_frame_equal(original, written_and_read_again)
417416

417+
def test_nan_to_missing_value(self):
418+
s1 = Series(np.arange(4.0), dtype=np.float32)
419+
s2 = Series(np.arange(4.0), dtype=np.float64)
420+
s1[::2] = np.nan
421+
s2[1::2] = np.nan
422+
original = DataFrame({'s1': s1, 's2': s2})
423+
original.index.name = 'index'
424+
with tm.ensure_clean() as path:
425+
original.to_stata(path)
426+
written_and_read_again = self.read_dta(path)
427+
written_and_read_again = written_and_read_again.set_index('index')
428+
tm.assert_frame_equal(written_and_read_again, original)
429+
430+
def test_no_index(self):
431+
columns = ['x', 'y']
432+
original = DataFrame(np.reshape(np.arange(10.0), (5, 2)),
433+
columns=columns)
434+
original.index.name = 'index_not_written'
435+
with tm.ensure_clean() as path:
436+
original.to_stata(path, write_index=False)
437+
written_and_read_again = self.read_dta(path)
438+
tm.assertRaises(KeyError,
439+
lambda: written_and_read_again['index_not_written'])
440+
441+
def test_string_no_dates(self):
442+
s1 = Series(['a', 'A longer string'])
443+
s2 = Series([1.0, 2.0], dtype=np.float64)
444+
original = DataFrame({'s1': s1, 's2': s2})
445+
original.index.name = 'index'
446+
with tm.ensure_clean() as path:
447+
original.to_stata(path)
448+
written_and_read_again = self.read_dta(path)
449+
tm.assert_frame_equal(written_and_read_again.set_index('index'),
450+
original)
451+
452+
def test_large_value_conversion(self):
453+
s0 = Series([1, 99], dtype=np.int8)
454+
s1 = Series([1, 127], dtype=np.int8)
455+
s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
456+
s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
457+
original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
458+
original.index.name = 'index'
459+
with tm.ensure_clean() as path:
460+
with warnings.catch_warnings(record=True) as w:
461+
tm.assert_produces_warning(original.to_stata(path),
462+
PossiblePrecisionLoss)
463+
# should produce a single warning
464+
tm.assert_equal(len(w), 1)
465+
466+
written_and_read_again = self.read_dta(path)
467+
modified = original.copy()
468+
modified['s1'] = Series(modified['s1'], dtype=np.int16)
469+
modified['s2'] = Series(modified['s2'], dtype=np.int32)
470+
modified['s3'] = Series(modified['s3'], dtype=np.float64)
471+
tm.assert_frame_equal(written_and_read_again.set_index('index'),
472+
modified)
473+
474+
def test_dates_invalid_column(self):
475+
original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
476+
original.index.name = 'index'
477+
with tm.ensure_clean() as path:
478+
with warnings.catch_warnings(record=True) as w:
479+
tm.assert_produces_warning(original.to_stata(path, {0: 'tc'}),
480+
InvalidColumnName)
481+
tm.assert_equal(len(w), 1)
482+
483+
written_and_read_again = self.read_dta(path)
484+
modified = original.copy()
485+
modified.columns = ['_0']
486+
tm.assert_frame_equal(written_and_read_again.set_index('index'),
487+
modified)
488+
489+
def test_date_export_formats(self):
490+
columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty']
491+
conversions = dict(((c, c) for c in columns))
492+
data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
493+
original = DataFrame([data], columns=columns)
494+
original.index.name = 'index'
495+
expected_values = [datetime(2006, 11, 20, 23, 13, 20), # Time
496+
datetime(2006, 11, 20), # Day
497+
datetime(2006, 11, 19), # Week
498+
datetime(2006, 11, 1), # Month
499+
datetime(2006, 10, 1), # Quarter year
500+
datetime(2006, 7, 1), # Half year
501+
datetime(2006, 1, 1)] # Year
502+
503+
expected = DataFrame([expected_values], columns=columns)
504+
expected.index.name = 'index'
505+
with tm.ensure_clean() as path:
506+
original.to_stata(path, conversions)
507+
written_and_read_again = self.read_dta(path)
508+
tm.assert_frame_equal(written_and_read_again.set_index('index'),
509+
expected)
510+
418511

419512
if __name__ == '__main__':
420513
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
421514
exit=False)
515+

0 commit comments

Comments
 (0)