Revert "StataWriter: Replace non-isalnum characters in variable names by _ instead of integral representation of replaced character. Eliminate duplicates created by replacement."

jreback · jreback · commit a5a2c9210b93 · 2013-12-04T11:08:02.000-05:00
Fails under py2.6 / win32 because of unicode issues. This reverts commit c573630.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -221,7 +221,6 @@ Improvements to existing features
     MultiIndex and Hierarchical Rows. Set the ``merge_cells`` to ``False`` to
     restore the previous behaviour.  (:issue:`5254`)
   - The FRED DataReader now accepts multiple series (:issue`3413`)
-  - StataWriter adjusts variable names to Stata's limitations
 
 API Changes
 ~~~~~~~~~~~
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -957,49 +957,11 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
             self._write(typ)
 
         # varlist, length 33*nvar, char array, null terminated
-        converted_names = []
-        duplicate_var_id = 0
-        for j, name in enumerate(self.varlist):
-            orig_name = name
-            # Replaces all characters disallowed in .dta format by their integral representation.
-            for c in name:
-                if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_':
-                    name = name.replace(c, '_')
-
-            # Variable name may not start with a number
-            if name[0] > '0' and name[0] < '9':
-                name = '_' + name
-
-            name = name[:min(len(name), 32)]
-
-            if not name == orig_name:
-                # check for duplicates
-                while self.varlist.count(name) > 0:
-                    # prepend ascending number to avoid duplicates
-                    name = '_' + str(duplicate_var_id) + name
-                    name = name[:min(len(name), 32)]
-                    duplicate_var_id += 1
-
-                converted_names.append('{0}    ->    {1}'.format(orig_name, name))
-                self.varlist[j] = name
-
         for name in self.varlist:
             name = self._null_terminate(name, True)
             name = _pad_bytes(name[:32], 33)
             self._write(name)
 
-        if converted_names:
-            from warnings import warn
-            warn("""Not all pandas column names were valid Stata variable names.
-                Made the following replacements:
-
-                    {0}
-
-                If this is not what you expect, please make sure you have Stata-compliant
-                column names in your DataFrame (max 32 characters, only alphanumerics and
-                underscores)/
-                """.format('\n    '.join(converted_names)))
-
         # srtlist, 2*(nvar+1), int array, encoded by byteorder
         srtlist = _pad_bytes("", (2*(nvar+1)))
         self._write(srtlist)
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -231,42 +231,6 @@ def test_encoding(self):
             self.assert_(result == expected)
             self.assert_(isinstance(result, unicode))
 
-    def test_read_write_dta11(self):
-        original = DataFrame([(1, 2, 3, 4)],
-                             columns=['good', 'bäd', '8number', 'astringwithmorethan32characters______'])
-        if compat.PY3:
-            formatted = DataFrame([(1, 2, 3, 4)],
-                                  columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_'])
-        else:
-            formatted = DataFrame([(1, 2, 3, 4)],
-                                  columns=['good', 'b__d', '_8number', 'astringwithmorethan32characters_'])
-        formatted.index.name = 'index'
-
-        with tm.ensure_clean() as path:
-            with warnings.catch_warnings(record=True) as w:
-                original.to_stata(path, None, False)
-                np.testing.assert_equal(
-                    len(w), 1)  # should get a warning for that format.
-
-            written_and_read_again = self.read_dta(path)
-            tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
-
-    def test_read_write_dta12(self):
-        original = DataFrame([(1, 2, 3, 4)],
-                             columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-'])
-        formatted = DataFrame([(1, 2, 3, 4)],
-                              columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_'])
-        formatted.index.name = 'index'
-
-        with tm.ensure_clean() as path:
-            with warnings.catch_warnings(record=True) as w:
-                original.to_stata(path, None, False)
-                np.testing.assert_equal(
-                    len(w), 1)  # should get a warning for that format.
-
-            written_and_read_again = self.read_dta(path)
-            tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
-
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)