Skip to content

Commit a5a2c92

Browse files
committed
Revert "StataWriter: Replace non-isalnum characters in variable names by _ instead of integral representation of replaced character. Eliminate duplicates created by replacement."
Fails under py2.6 / win32 because of unicode issues. This reverts commit c573630.
1 parent 5a750ce commit a5a2c92

File tree

3 files changed

+0
-75
lines changed

3 files changed

+0
-75
lines changed

doc/source/release.rst

-1
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,6 @@ Improvements to existing features
221221
MultiIndex and Hierarchical Rows. Set the ``merge_cells`` to ``False`` to
222222
restore the previous behaviour. (:issue:`5254`)
223223
- The FRED DataReader now accepts multiple series (:issue:`3413`)
224-
- StataWriter adjusts variable names to Stata's limitations
225224

226225
API Changes
227226
~~~~~~~~~~~

pandas/io/stata.py

-38
Original file line numberDiff line numberDiff line change
@@ -957,49 +957,11 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
957957
self._write(typ)
958958

959959
# varlist, length 33*nvar, char array, null terminated
960-
converted_names = []
961-
duplicate_var_id = 0
962-
for j, name in enumerate(self.varlist):
963-
orig_name = name
964-
# Replace every character disallowed in the .dta format with an underscore.
965-
for c in name:
966-
if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_':
967-
name = name.replace(c, '_')
968-
969-
# Variable name may not start with a number
970-
if name[0] > '0' and name[0] < '9':
971-
name = '_' + name
972-
973-
name = name[:min(len(name), 32)]
974-
975-
if not name == orig_name:
976-
# check for duplicates
977-
while self.varlist.count(name) > 0:
978-
# prepend ascending number to avoid duplicates
979-
name = '_' + str(duplicate_var_id) + name
980-
name = name[:min(len(name), 32)]
981-
duplicate_var_id += 1
982-
983-
converted_names.append('{0} -> {1}'.format(orig_name, name))
984-
self.varlist[j] = name
985-
986960
for name in self.varlist:
987961
name = self._null_terminate(name, True)
988962
name = _pad_bytes(name[:32], 33)
989963
self._write(name)
990964

991-
if converted_names:
992-
from warnings import warn
993-
warn("""Not all pandas column names were valid Stata variable names.
994-
Made the following replacements:
995-
996-
{0}
997-
998-
If this is not what you expect, please make sure you have Stata-compliant
999-
column names in your DataFrame (max 32 characters, only alphanumerics and
1000-
underscores)/
1001-
""".format('\n '.join(converted_names)))
1002-
1003965
# srtlist, 2*(nvar+1), int array, encoded by byteorder
1004966
srtlist = _pad_bytes("", (2*(nvar+1)))
1005967
self._write(srtlist)

pandas/io/tests/test_stata.py

-36
Original file line numberDiff line numberDiff line change
@@ -231,42 +231,6 @@ def test_encoding(self):
231231
self.assert_(result == expected)
232232
self.assert_(isinstance(result, unicode))
233233

234-
def test_read_write_dta11(self):
235-
original = DataFrame([(1, 2, 3, 4)],
236-
columns=['good', 'bäd', '8number', 'astringwithmorethan32characters______'])
237-
if compat.PY3:
238-
formatted = DataFrame([(1, 2, 3, 4)],
239-
columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_'])
240-
else:
241-
formatted = DataFrame([(1, 2, 3, 4)],
242-
columns=['good', 'b__d', '_8number', 'astringwithmorethan32characters_'])
243-
formatted.index.name = 'index'
244-
245-
with tm.ensure_clean() as path:
246-
with warnings.catch_warnings(record=True) as w:
247-
original.to_stata(path, None, False)
248-
np.testing.assert_equal(
249-
len(w), 1) # should get a warning for that format.
250-
251-
written_and_read_again = self.read_dta(path)
252-
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
253-
254-
def test_read_write_dta12(self):
255-
original = DataFrame([(1, 2, 3, 4)],
256-
columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-'])
257-
formatted = DataFrame([(1, 2, 3, 4)],
258-
columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_'])
259-
formatted.index.name = 'index'
260-
261-
with tm.ensure_clean() as path:
262-
with warnings.catch_warnings(record=True) as w:
263-
original.to_stata(path, None, False)
264-
np.testing.assert_equal(
265-
len(w), 1) # should get a warning for that format.
266-
267-
written_and_read_again = self.read_dta(path)
268-
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
269-
270234
if __name__ == '__main__':
271235
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
272236
exit=False)

0 commit comments

Comments
 (0)