Skip to content

Commit f6fd509

Browse files
committed
Merge pull request #5709 from PKEuS/master
StataWriter: Replace non-isalnum characters in variable names by _ instead of integral representation of replaced character. Eliminate duplicates created by replacement.
2 parents 8a91b22 + 7d0a6ed commit f6fd509

File tree

3 files changed

+77
-0
lines changed

3 files changed

+77
-0
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ Improvements to existing features
221221
MultiIndex and Hierarchical Rows. Set the ``merge_cells`` to ``False`` to
222222
restore the previous behaviour. (:issue:`5254`)
223223
- The FRED DataReader now accepts multiple series (:issue:`3413`)
224+
- StataWriter adjusts variable names to Stata's limitations (:issue:`5709`)
224225

225226
API Changes
226227
~~~~~~~~~~~

pandas/io/stata.py

+44
Original file line numberDiff line numberDiff line change
@@ -1068,11 +1068,55 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
10681068
self._write(typ)
10691069

10701070
# varlist, length 33*nvar, char array, null terminated
1071+
converted_names = []
1072+
duplicate_var_id = 0
1073+
for j, name in enumerate(self.varlist):
1074+
orig_name = name
1075+
# Replace every character that is not a letter, a digit or an underscore with '_'.
1076+
for c in name:
1077+
if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_':
1078+
name = name.replace(c, '_')
1079+
1080+
# Variable name may not start with a number
1081+
if name[0] > '0' and name[0] < '9':
1082+
name = '_' + name
1083+
1084+
name = name[:min(len(name), 32)]
1085+
1086+
if not name == orig_name:
1087+
# check for duplicates
1088+
while self.varlist.count(name) > 0:
1089+
# prepend ascending number to avoid duplicates
1090+
name = '_' + str(duplicate_var_id) + name
1091+
name = name[:min(len(name), 32)]
1092+
duplicate_var_id += 1
1093+
1094+
# need to possibly encode the orig name if its unicode
1095+
try:
1096+
orig_name = orig_name.encode('utf-8')
1097+
except:
1098+
pass
1099+
1100+
converted_names.append('{0} -> {1}'.format(orig_name, name))
1101+
self.varlist[j] = name
1102+
10711103
for name in self.varlist:
10721104
name = self._null_terminate(name, True)
10731105
name = _pad_bytes(name[:32], 33)
10741106
self._write(name)
10751107

1108+
if converted_names:
1109+
from warnings import warn
1110+
warn("""Not all pandas column names were valid Stata variable names.
1111+
Made the following replacements:
1112+
1113+
{0}
1114+
1115+
If this is not what you expect, please make sure you have Stata-compliant
1116+
column names in your DataFrame (max 32 characters, only alphanumerics and
1117+
underscores)/
1118+
""".format('\n '.join(converted_names)))
1119+
10761120
# srtlist, 2*(nvar+1), int array, encoded by byteorder
10771121
srtlist = _pad_bytes("", (2*(nvar+1)))
10781122
self._write(srtlist)

pandas/io/tests/test_stata.py

+32
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,38 @@ def test_encoding(self):
231231
self.assert_(result == expected)
232232
self.assert_(isinstance(result, unicode))
233233

234+
def test_read_write_dta11(self):
235+
original = DataFrame([(1, 2, 3, 4)],
236+
columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______'])
237+
formatted = DataFrame([(1, 2, 3, 4)],
238+
columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_'])
239+
formatted.index.name = 'index'
240+
241+
with tm.ensure_clean() as path:
242+
with warnings.catch_warnings(record=True) as w:
243+
original.to_stata(path, None, False)
244+
np.testing.assert_equal(
245+
len(w), 1) # should get a warning for that format.
246+
247+
written_and_read_again = self.read_dta(path)
248+
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
249+
250+
def test_read_write_dta12(self):
251+
original = DataFrame([(1, 2, 3, 4)],
252+
columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-'])
253+
formatted = DataFrame([(1, 2, 3, 4)],
254+
columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_'])
255+
formatted.index.name = 'index'
256+
257+
with tm.ensure_clean() as path:
258+
with warnings.catch_warnings(record=True) as w:
259+
original.to_stata(path, None, False)
260+
np.testing.assert_equal(
261+
len(w), 1) # should get a warning for that format.
262+
263+
written_and_read_again = self.read_dta(path)
264+
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
265+
234266
if __name__ == '__main__':
235267
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
236268
exit=False)

0 commit comments

Comments
 (0)