diff --git a/doc/source/release.rst b/doc/source/release.rst
index 5d40cbe82e87b..fef086f8c5f57 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -221,6 +221,7 @@ Improvements to existing features
     MultiIndex and Hierarchical Rows. Set the ``merge_cells`` to ``False`` to
     restore the previous behaviour. (:issue:`5254`)
   - The FRED DataReader now accepts multiple series (:issue`3413`)
+  - StataWriter adjusts variable names to Stata's limitations (:issue:`5709`)
 
 API Changes
 ~~~~~~~~~~~
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 8c172db162cd6..55bcbd76c2248 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1068,11 +1068,55 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
             self._write(typ)
 
         # varlist, length 33*nvar, char array, null terminated
+        converted_names = []
+        duplicate_var_id = 0
+        for j, name in enumerate(self.varlist):
+            orig_name = name
+            # Replace every character that is not an ASCII letter, digit or '_' with '_'.
+            for c in name:
+                if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_':
+                    name = name.replace(c, '_')
+
+            # Variable name may not start with a digit (bounds '0' and '9' inclusive)
+            if '0' <= name[0] <= '9':
+                name = '_' + name
+
+            name = name[:32]
+
+            if name != orig_name:
+                # the sanitised name must not collide with any other variable name
+                while name in self.varlist:
+                    # prepend ascending number to avoid duplicates
+                    name = '_' + str(duplicate_var_id) + name
+                    name = name[:32]
+                    duplicate_var_id += 1
+
+                # best effort: encode the original name for the message if it is unicode
+                try:
+                    orig_name = orig_name.encode('utf-8')
+                except (UnicodeError, AttributeError):
+                    pass
+
+                converted_names.append('{0} -> {1}'.format(orig_name, name))
+                self.varlist[j] = name
+
         for name in self.varlist:
             name = self._null_terminate(name, True)
             name = _pad_bytes(name[:32], 33)
             self._write(name)
 
+        if converted_names:
+            from warnings import warn
+            warn("""Not all pandas column names were valid Stata variable names.
+            Made the following replacements:
+
+            {0}
+
+            If this is not what you expect, please make sure you have Stata-compliant
+            column names in your DataFrame (max 32 characters, only alphanumerics and
+            underscores).
+            """.format('\n '.join(converted_names)))
+
         # srtlist, 2*(nvar+1), int array, encoded by byteorder
         srtlist = _pad_bytes("", (2*(nvar+1)))
         self._write(srtlist)
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 76dae396c04ed..f75cf7ebb18d1 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -231,6 +231,38 @@ def test_encoding(self):
             self.assert_(result == expected)
             self.assert_(isinstance(result, unicode))
 
+    def test_read_write_dta11(self):
+        original = DataFrame([(1, 2, 3, 4)],
+                             columns=['good', compat.u('b\u00E4d'), '9number', 'astringwithmorethan32characters______'])
+        formatted = DataFrame([(1, 2, 3, 4)],
+                              columns=['good', 'b_d', '_9number', 'astringwithmorethan32characters_'])
+        formatted.index.name = 'index'
+
+        with tm.ensure_clean() as path:
+            with warnings.catch_warnings(record=True) as w:
+                original.to_stata(path, None, False)
+                np.testing.assert_equal(
+                    len(w), 1)  # invalid column names must produce exactly one warning
+
+            written_and_read_again = self.read_dta(path)
+            tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
+
+    def test_read_write_dta12(self):
+        original = DataFrame([(1, 2, 3, 4)],
+                             columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-'])
+        formatted = DataFrame([(1, 2, 3, 4)],
+                              columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_'])
+        formatted.index.name = 'index'
+
+        with tm.ensure_clean() as path:
+            with warnings.catch_warnings(record=True) as w:
+                original.to_stata(path, None, False)
+                np.testing.assert_equal(
+                    len(w), 1)  # colliding sanitised names must produce exactly one warning
+
+            written_and_read_again = self.read_dta(path)
+            tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)