FIX: Fix encoding to allow StataReader to read URLs

bashtage · bashtage · commit 2f0ada3d430f · 2015-01-15T21:42:23.000-05:00
Fix encoding so that StataReader can correctly read URLs. Closes pandas-dev#9231.
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -106,7 +106,7 @@ Bug Fixes
 - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
 
 
-
+- Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
 
 
 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -44,7 +44,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
         Read value labels and convert columns to Categorical/Factor variables
     encoding : string, None or encoding
         Encoding used to parse the files. Note that Stata doesn't
-        support unicode. None defaults to cp1252.
+        support unicode. None defaults to iso-8859-1.
     index : identifier of index column
         identifier of column that should be used as index of the DataFrame
     convert_missing : boolean, defaults to False
@@ -683,7 +683,7 @@ def get_base_missing_value(cls, dtype):
 
 
 class StataParser(object):
-    _default_encoding = 'cp1252'
+    _default_encoding = 'iso-8859-1'
 
     def __init__(self, encoding):
         self._encoding = encoding
@@ -823,10 +823,10 @@ class StataReader(StataParser):
         Path to .dta file or object implementing a binary read() functions
     encoding : string, None or encoding
         Encoding used to parse the files. Note that Stata doesn't
-        support unicode. None defaults to cp1252.
+        support unicode. None defaults to iso-8859-1.
     """
 
-    def __init__(self, path_or_buf, encoding='cp1252'):
+    def __init__(self, path_or_buf, encoding='iso-8859-1'):
         super(StataReader, self).__init__(encoding)
         self.col_sizes = ()
         self._has_string_data = False
@@ -841,7 +841,13 @@ def __init__(self, path_or_buf, encoding='cp1252'):
         if isinstance(path_or_buf, (str, compat.text_type, bytes)):
             self.path_or_buf = open(path_or_buf, 'rb')
         else:
-            self.path_or_buf = path_or_buf
+            # Copy to BytesIO, and ensure no encoding
+            contents = path_or_buf.read()
+            try:
+                contents = contents.encode(self._default_encoding)
+            except:
+                pass
+            self.path_or_buf = BytesIO(contents)
 
         self._read_header()
 
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -889,7 +889,6 @@ def test_categorical_ordering(self):
             tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
             tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)
 
-
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)

Original file line number	Diff line number	Diff line change
`@@ -106,7 +106,7 @@ Bug Fixes`
`106`	`106`	- Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
`107`	`107`
`108`	`108`
`109`		`-`
	`109`	+- Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
`110`	`110`
`111`	`111`
`112`	`112`