Skip to content

Commit 7745d72

Browse files
bashtagejreback
authored andcommitted
ENH: Add uint and bool support in to_stata
Added support for uint (uint8, uint16 and uint32, but not uint64) and bool datatypes in to_stata. Added an explanation of supported data types in io.rst. closes #7097 and closes #7365
1 parent 27b2b7d commit 7745d72

File tree

4 files changed

+80
-5
lines changed

4 files changed

+80
-5
lines changed

doc/source/io.rst

+25
Original file line numberDiff line numberDiff line change
@@ -3504,6 +3504,31 @@ into a .dta file. The format version of this file is always 115 (Stata 12).
35043504
df = DataFrame(randn(10, 2), columns=list('AB'))
35053505
df.to_stata('stata.dta')
35063506
3507+
*Stata* data files have limited data type support; only strings with 244 or
3508+
fewer characters, ``int8``, ``int16``, ``int32`` and ``float64`` can be stored
3509+
in ``.dta`` files. *Stata* reserves certain values to represent
3510+
missing data. Furthermore, when a value is encountered outside of the
3511+
permitted range, the data type is upcast to the next larger size. For
3512+
example, ``int8`` values are restricted to lie between -127 and 100, and so
3513+
variables with values above 100 will trigger a conversion to ``int16``. ``nan``
3514+
values in floating-point data types are stored as the basic missing data type
3515+
(``.`` in *Stata*). It is not possible to indicate missing data values for
3516+
integer data types.
3517+
3518+
The *Stata* writer gracefully handles other data types including ``int64``,
3519+
``bool``, ``uint8``, ``uint16``, ``uint32`` and ``float32`` by upcasting to
3520+
the smallest supported type that can represent the data. For example, data
3521+
with a type of ``uint8`` will be cast to ``int8`` if all values are no greater than
3522+
100 (the upper bound for non-missing ``int8`` data in *Stata*), or, if values are
3523+
outside of this range, the data is cast to ``int16``.
3524+
3525+
3526+
.. warning::
3527+
3528+
Conversion from ``int64`` to ``float64`` may result in a loss of precision
3529+
if ``int64`` values are larger than 2**53.
3530+
3531+
35073532
.. _io.stata_reader:
35083533

35093534
Reading from STATA format

doc/source/v0.15.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ Known Issues
114114

115115
Enhancements
116116
~~~~~~~~~~~~
117-
117+
- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`)
118118

119119

120120

pandas/io/stata.py

+31-4
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ class InvalidColumnName(Warning):
206206
underscores, no Stata reserved words)
207207
"""
208208

209+
209210
def _cast_to_stata_types(data):
210211
"""Checks the dtypes of the columns of a pandas DataFrame for
211212
compatibility with the data types and ranges supported by Stata, and
@@ -218,18 +219,44 @@ def _cast_to_stata_types(data):
218219
219220
Notes
220221
-----
221-
Numeric columns must be one of int8, int16, int32, float32 or float64, with
222-
some additional value restrictions on the integer data types. int8 and
223-
int16 columns are checked for violations of the value restrictions and
222+
Numeric columns in Stata must be one of int8, int16, int32, float32 or
223+
float64, with some additional value restrictions. int8 and int16 columns
224+
are checked for violations of the value restrictions and
224225
upcast if needed. int64 data is not usable in Stata, and so it is
225226
downcast to int32 whenever the values are in the int32 range, and
226227
sidecast to float64 when larger than this range. If the int64 values
227228
are outside of the range of those perfectly representable as float64 values,
228229
a warning is raised.
230+
231+
bool columns are cast to int8. uint columns are converted to int of the same
232+
size if there is no loss in precision, otherwise they are upcast to a larger
233+
type. uint64 is currently not supported since it is converted to object in
234+
a DataFrame.
229235
"""
230236
ws = ''
237+
# original, if small, if large
238+
conversion_data = ((np.bool, np.int8, np.int8),
239+
(np.uint8, np.int8, np.int16),
240+
(np.uint16, np.int16, np.int32),
241+
(np.uint32, np.int32, np.int64))
242+
231243
for col in data:
232244
dtype = data[col].dtype
245+
# Cast from unsupported types to supported types
246+
for c_data in conversion_data:
247+
if dtype == c_data[0]:
248+
if data[col].max() <= np.iinfo(c_data[1]).max:
249+
dtype = c_data[1]
250+
else:
251+
dtype = c_data[2]
252+
if c_data[2] == np.float64: # Warn if necessary
253+
if data[col].max() >= 2 * 53:
254+
ws = precision_loss_doc % ('uint64', 'float64')
255+
256+
data[col] = data[col].astype(dtype)
257+
258+
259+
# Check values and upcast if necessary
233260
if dtype == np.int8:
234261
if data[col].max() > 100 or data[col].min() < -127:
235262
data[col] = data[col].astype(np.int16)
@@ -241,7 +268,7 @@ def _cast_to_stata_types(data):
241268
data[col] = data[col].astype(np.int32)
242269
else:
243270
data[col] = data[col].astype(np.float64)
244-
if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53:
271+
if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:
245272
ws = precision_loss_doc % ('int64', 'float64')
246273

247274
if ws:

pandas/io/tests/test_stata.py

+23
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,29 @@ def test_write_missing_strings(self):
527527
tm.assert_frame_equal(written_and_read_again.set_index('index'),
528528
expected)
529529

530+
def test_bool_uint(self):
531+
s0 = Series([0, 1, True], dtype=np.bool)
532+
s1 = Series([0, 1, 100], dtype=np.uint8)
533+
s2 = Series([0, 1, 255], dtype=np.uint8)
534+
s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16)
535+
s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16)
536+
s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32)
537+
s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32)
538+
539+
original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3,
540+
's4': s4, 's5': s5, 's6': s6})
541+
original.index.name = 'index'
542+
expected = original.copy()
543+
expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32,
544+
np.int32, np.float64)
545+
for c, t in zip(expected.columns, expected_types):
546+
expected[c] = expected[c].astype(t)
547+
548+
with tm.ensure_clean() as path:
549+
original.to_stata(path)
550+
written_and_read_again = self.read_dta(path)
551+
written_and_read_again = written_and_read_again.set_index('index')
552+
tm.assert_frame_equal(written_and_read_again, expected)
530553

531554
if __name__ == '__main__':
532555
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)