Skip to content

Commit 62aa9ec

Browse files
committed
Merge pull request #5525 from PKEuS/master
StataWriter: Replace non-isalnum characters in variable names by _ inste...
2 parents 68946db + c573630 commit 62aa9ec

File tree

3 files changed

+75
-0
lines changed

3 files changed

+75
-0
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ Improvements to existing features
221221
MultiIndex and Hierarchical Rows. Set the ``merge_cells`` to ``False`` to
222222
restore the previous behaviour. (:issue:`5254`)
223223
- The FRED DataReader now accepts multiple series (:issue:`3413`)
224+
- StataWriter adjusts variable names to Stata's limitations
224225

225226
API Changes
226227
~~~~~~~~~~~

pandas/io/stata.py

+38
Original file line numberDiff line numberDiff line change
@@ -957,11 +957,49 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
957957
self._write(typ)
958958

959959
# varlist, length 33*nvar, char array, null terminated
960+
# Make every entry of self.varlist a name Stata accepts: only ASCII
# letters, digits and '_', not starting with a digit, at most 32 characters.
# Names that had to be changed are rewritten in place in self.varlist and
# recorded in converted_names as "old -> new" so they can be reported.
converted_names = []
duplicate_var_id = 0
for j, name in enumerate(self.varlist):
    orig_name = name
    # Replace every character that is not an ASCII letter, a digit or an
    # underscore by '_'.  The loop iterates over the original string, so
    # rebinding `name` inside it is safe.
    for c in name:
        if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_':
            name = name.replace(c, '_')

    # Variable name may not start with a digit ('0' and '9' included) and
    # must not be empty; prefix an underscore in both cases.
    if not name or '0' <= name[0] <= '9':
        name = '_' + name

    name = name[:32]

    if name != orig_name:
        # check for duplicates
        while self.varlist.count(name) > 0:
            # prepend ascending number to avoid duplicates
            name = '_' + str(duplicate_var_id) + name
            name = name[:32]
            duplicate_var_id += 1

        converted_names.append('{0} -> {1}'.format(orig_name, name))
        self.varlist[j] = name
985+
960986
for name in self.varlist:
961987
name = self._null_terminate(name, True)
962988
name = _pad_bytes(name[:32], 33)
963989
self._write(name)
964990

991+
# Tell the user which column names were rewritten to satisfy Stata's
# naming rules; nothing is emitted when no name had to be changed.
if converted_names:
    from warnings import warn
    warn("""Not all pandas column names were valid Stata variable names.
Made the following replacements:

{0}

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (max 32 characters, only alphanumerics and
underscores).
""".format('\n '.join(converted_names)))
1002+
9651003
# srtlist, 2*(nvar+1), int array, encoded by byteorder
9661004
srtlist = _pad_bytes("", (2*(nvar+1)))
9671005
self._write(srtlist)

pandas/io/tests/test_stata.py

+36
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,42 @@ def test_encoding(self):
231231
self.assert_(result == expected)
232232
self.assert_(isinstance(result, unicode))
233233

234+
def test_read_write_dta11(self):
    # Write a frame whose column names are not valid Stata variable names
    # (non-ASCII character, leading digit, more than 32 characters) and
    # check that the file read back carries the adjusted names.
    values = [(1, 2, 3, 4)]
    original = DataFrame(
        values,
        columns=['good', 'bäd', '8number',
                 'astringwithmorethan32characters______'])

    # The expected replacement for 'bäd' differs between Python 3 and
    # Python 2 (presumably because 'ä' spans two bytes in a Python 2 str).
    replaced = 'b_d' if compat.PY3 else 'b__d'
    formatted = DataFrame(
        values,
        columns=['good', replaced, '_8number',
                 'astringwithmorethan32characters_'])
    formatted.index.name = 'index'

    with tm.ensure_clean() as path:
        with warnings.catch_warnings(record=True) as caught:
            original.to_stata(path, None, False)
            # should get a warning for that format.
            np.testing.assert_equal(len(caught), 1)

        roundtrip = self.read_dta(path)
        tm.assert_frame_equal(roundtrip.set_index('index'), formatted)
253+
254+
def test_read_write_dta12(self):
    # Column names that collide once truncated to 32 characters or once
    # their invalid characters are replaced must come back de-duplicated
    # with an ascending '_<n>' prefix.
    source_columns = ['astringwithmorethan32characters_1',
                      'astringwithmorethan32characters_2',
                      '+',
                      '-']
    expected_columns = ['astringwithmorethan32characters_',
                        '_0astringwithmorethan32character',
                        '_',
                        '_1_']
    original = DataFrame([(1, 2, 3, 4)], columns=source_columns)
    formatted = DataFrame([(1, 2, 3, 4)], columns=expected_columns)
    formatted.index.name = 'index'

    with tm.ensure_clean() as path:
        with warnings.catch_warnings(record=True) as caught:
            original.to_stata(path, None, False)
            # should get a warning for that format.
            np.testing.assert_equal(len(caught), 1)

        roundtrip = self.read_dta(path)
        tm.assert_frame_equal(roundtrip.set_index('index'), formatted)
269+
234270
if __name__ == '__main__':
235271
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
236272
exit=False)

0 commit comments

Comments
 (0)