Merge pull request #5525 from PKEuS/master

jreback · jreback · commit 62aa9ec3bfc7 · 2013-12-04T05:02:57.000-08:00
StataWriter: Replace non-isalnum characters in variable names by _ inste...
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -221,6 +221,7 @@ Improvements to existing features
     MultiIndex and Hierarchical Rows. Set the ``merge_cells`` to ``False`` to
     restore the previous behaviour.  (:issue:`5254`)
   - The FRED DataReader now accepts multiple series (:issue`3413`)
+  - StataWriter adjusts variable names to Stata's limitations
 
 API Changes
 ~~~~~~~~~~~
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -957,11 +957,49 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
             self._write(typ)
 
         # varlist, length 33*nvar, char array, null terminated
+        converted_names = []
+        duplicate_var_id = 0
+        for j, name in enumerate(self.varlist):
+            orig_name = name
+            # Replace every character that is not allowed in a .dta variable name with an underscore.
+            for c in name:
+                if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_':
+                    name = name.replace(c, '_')
+
+            # Variable names may not start with a digit
+            if name[0] > '0' and name[0] < '9':
+                name = '_' + name
+
+            name = name[:min(len(name), 32)]
+
+            if not name == orig_name:
+                # check for duplicates
+                while self.varlist.count(name) > 0:
+                    # prepend ascending number to avoid duplicates
+                    name = '_' + str(duplicate_var_id) + name
+                    name = name[:min(len(name), 32)]
+                    duplicate_var_id += 1
+
+                converted_names.append('{0}    ->    {1}'.format(orig_name, name))
+                self.varlist[j] = name
+
         for name in self.varlist:
             name = self._null_terminate(name, True)
             name = _pad_bytes(name[:32], 33)
             self._write(name)
 
+        if converted_names:
+            from warnings import warn
+            warn("""Not all pandas column names were valid Stata variable names.
+                Made the following replacements:
+
+                    {0}
+
+                If this is not what you expect, please make sure you have Stata-compliant
+                column names in your DataFrame (max 32 characters, only alphanumerics and
+                underscores)/
+                """.format('\n    '.join(converted_names)))
+
         # srtlist, 2*(nvar+1), int array, encoded by byteorder
         srtlist = _pad_bytes("", (2*(nvar+1)))
         self._write(srtlist)
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -231,6 +231,42 @@ def test_encoding(self):
             self.assert_(result == expected)
             self.assert_(isinstance(result, unicode))
 
+    def test_read_write_dta11(self):
+        original = DataFrame([(1, 2, 3, 4)],
+                             columns=['good', 'bäd', '8number', 'astringwithmorethan32characters______'])
+        if compat.PY3:
+            formatted = DataFrame([(1, 2, 3, 4)],
+                                  columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_'])
+        else:
+            formatted = DataFrame([(1, 2, 3, 4)],
+                                  columns=['good', 'b__d', '_8number', 'astringwithmorethan32characters_'])
+        formatted.index.name = 'index'
+
+        with tm.ensure_clean() as path:
+            with warnings.catch_warnings(record=True) as w:
+                original.to_stata(path, None, False)
+                np.testing.assert_equal(
+                    len(w), 1)  # should get a warning for that format.
+
+            written_and_read_again = self.read_dta(path)
+            tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
+
+    def test_read_write_dta12(self):
+        original = DataFrame([(1, 2, 3, 4)],
+                             columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-'])
+        formatted = DataFrame([(1, 2, 3, 4)],
+                              columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_'])
+        formatted.index.name = 'index'
+
+        with tm.ensure_clean() as path:
+            with warnings.catch_warnings(record=True) as w:
+                original.to_stata(path, None, False)
+                np.testing.assert_equal(
+                    len(w), 1)  # should get a warning for that format.
+
+            written_and_read_again = self.read_dta(path)
+            tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)