Skip to content

Commit 2baaefe

Browse files
committed
Merge pull request #8564 from bashtage/stata-datatype-conversion
ENH: Add type conversion to read_stata and StataReader
2 parents ec31259 + 396e7f3 commit 2baaefe

File tree

4 files changed

+62
-8
lines changed

4 files changed

+62
-8
lines changed

doc/source/io.rst

+6-1
Original file line numberDiff line numberDiff line change
@@ -3654,10 +3654,15 @@ missing values are represented as ``np.nan``. If ``True``, missing values are
36543654
represented using ``StataMissingValue`` objects, and columns containing missing
36553655
values will have ``dtype`` set to ``object``.
36563656

3657-
36583657
The ``StataReader`` supports .dta formats 104, 105, 108, 113-115 and 117.
36593658
Alternatively, the function :func:`~pandas.io.stata.read_stata` can be used
36603659

3660+
.. note::
3661+
3662+
Setting ``preserve_dtypes=False`` will upcast all integer data types to
3663+
``int64`` and all floating-point data types to ``float64``. By default,
3664+
the Stata data types are preserved when importing.
3665+
36613666
.. ipython:: python
36623667
:suppress:
36633668

doc/source/v0.15.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,7 @@ Enhancements
864864
- Added support for writing datetime64 columns with ``to_sql`` for all database flavors (:issue:`7103`).
865865

866866
- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`)
867+
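- Added ``preserve_dtypes`` option to ``read_stata`` and ``StataReader.data``; setting it to ``False`` upcasts integer data to ``int64`` and floating-point data to ``float64`` when importing Stata files (:issue:`8527`)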
867868

868869
- Added ``layout`` keyword to ``DataFrame.plot``. You can pass a tuple of ``(rows, columns)``, one of which can be ``-1`` to automatically infer (:issue:`6667`, :issue:`8071`).
869870
- Allow passing multiple axes to ``DataFrame.plot``, ``hist`` and ``boxplot`` (:issue:`5353`, :issue:`6970`, :issue:`7069`)

pandas/io/stata.py

+25-7
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
def read_stata(filepath_or_buffer, convert_dates=True,
3131
convert_categoricals=True, encoding=None, index=None,
32-
convert_missing=False):
32+
convert_missing=False, preserve_dtypes=True):
3333
"""
3434
Read Stata file into DataFrame
3535
@@ -52,13 +52,14 @@ def read_stata(filepath_or_buffer, convert_dates=True,
5252
If True, columns containing missing values are returned with
5353
object data types and missing values are represented by
5454
StataMissingValue objects.
55+
preserve_dtypes : boolean, defaults to True
56+
Preserve Stata datatypes. If False, numeric data are upcast to pandas
57+
default types for foreign data (float64 or int64)
5558
"""
5659
reader = StataReader(filepath_or_buffer, encoding)
5760

58-
return reader.data(convert_dates,
59-
convert_categoricals,
60-
index,
61-
convert_missing)
61+
return reader.data(convert_dates, convert_categoricals, index,
62+
convert_missing, preserve_dtypes)
6263

6364
_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
6465

@@ -976,7 +977,7 @@ def _read_strls(self):
976977
self.path_or_buf.read(1) # zero-termination
977978

978979
def data(self, convert_dates=True, convert_categoricals=True, index=None,
979-
convert_missing=False):
980+
convert_missing=False, preserve_dtypes=True):
980981
"""
981982
Reads observations from Stata file, converting them into a dataframe
982983
@@ -995,7 +996,9 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
995996
nans. If True, columns containing missing values are returned with
996997
object data types and missing values are represented by
997998
StataMissingValue objects.
998-
999+
preserve_dtypes : boolean, defaults to True
1000+
Preserve Stata datatypes. If False, numeric data are upcast to
1001+
pandas default types for foreign data (float64 or int64)
9991002
Returns
10001003
-------
10011004
y : DataFrame instance
@@ -1107,6 +1110,21 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
11071110
labeled_data[(data[col] == k).values] = v
11081111
data[col] = Categorical.from_array(labeled_data)
11091112

1113+
if not preserve_dtypes:
1114+
retyped_data = []
1115+
convert = False
1116+
for col in data:
1117+
dtype = data[col].dtype
1118+
if dtype in (np.float16, np.float32):
1119+
dtype = np.float64
1120+
convert = True
1121+
elif dtype in (np.int8, np.int16, np.int32):
1122+
dtype = np.int64
1123+
convert = True
1124+
retyped_data.append((col, data[col].astype(dtype)))
1125+
if convert:
1126+
data = DataFrame.from_items(retyped_data)
1127+
11101128
return data
11111129

11121130
def data_label(self):

pandas/io/tests/test_stata.py

+30
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ def setUp(self):
8383

8484

8585
def read_dta(self, file):
86+
# Legacy default reader configuration
8687
return read_stata(file, convert_dates=True)
8788

8889
def read_csv(self, file):
@@ -694,6 +695,35 @@ def test_big_dates(self):
694695
tm.assert_frame_equal(written_and_read_again.set_index('index'),
695696
expected)
696697

698+
def test_dtype_conversion(self):
699+
expected = self.read_csv(self.csv15)
700+
expected['byte_'] = expected['byte_'].astype(np.int8)
701+
expected['int_'] = expected['int_'].astype(np.int16)
702+
expected['long_'] = expected['long_'].astype(np.int32)
703+
expected['float_'] = expected['float_'].astype(np.float32)
704+
expected['double_'] = expected['double_'].astype(np.float64)
705+
expected['date_td'] = expected['date_td'].apply(datetime.strptime,
706+
args=('%Y-%m-%d',))
707+
708+
no_conversion = read_stata(self.dta15_117,
709+
convert_dates=True)
710+
tm.assert_frame_equal(expected, no_conversion)
711+
712+
conversion = read_stata(self.dta15_117,
713+
convert_dates=True,
714+
preserve_dtypes=False)
715+
716+
# read_csv types are the same
717+
expected = self.read_csv(self.csv15)
718+
expected['date_td'] = expected['date_td'].apply(datetime.strptime,
719+
args=('%Y-%m-%d',))
720+
721+
tm.assert_frame_equal(expected, conversion)
722+
723+
724+
725+
726+
697727

698728

699729

0 commit comments

Comments
 (0)