From 2f0ada3d430f0cc49fa9ab1e7e2e2a7ec23c9616 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard <kevin.sheppard@economics.ox.ac.uk>
Date: Tue, 13 Jan 2015 22:43:37 -0500
Subject: [PATCH] FIX: Fix encoding to allow StataReader to read urls

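Fix encoding so that StataReader can correctly read URLs. Inputs that are
not file paths are now read fully into a BytesIO buffer (text is encoded
to bytes first), and the default encoding changes from cp1252 to
iso-8859-1.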

closes #9231
---
 doc/source/whatsnew/v0.16.0.txt |  2 +-
 pandas/io/stata.py              | 16 +++++++++++-----
 pandas/io/tests/test_stata.py   |  1 -
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index 1528747891c64..b221a7df373a4 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -106,7 +106,7 @@ Bug Fixes
 - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
 
 
-
+- Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
 
 
 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index d8ebb8027c4ce..ccfe8468813c7 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -44,7 +44,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
         Read value labels and convert columns to Categorical/Factor variables
     encoding : string, None or encoding
         Encoding used to parse the files. Note that Stata doesn't
-        support unicode. None defaults to cp1252.
+        support unicode. None defaults to iso-8859-1.
     index : identifier of index column
         identifier of column that should be used as index of the DataFrame
     convert_missing : boolean, defaults to False
@@ -683,7 +683,7 @@ def get_base_missing_value(cls, dtype):
 
 
 class StataParser(object):
-    _default_encoding = 'cp1252'
+    _default_encoding = 'iso-8859-1'
 
     def __init__(self, encoding):
         self._encoding = encoding
@@ -823,10 +823,10 @@ class StataReader(StataParser):
         Path to .dta file or object implementing a binary read() functions
     encoding : string, None or encoding
         Encoding used to parse the files. Note that Stata doesn't
-        support unicode. None defaults to cp1252.
+        support unicode. None defaults to iso-8859-1.
     """
 
-    def __init__(self, path_or_buf, encoding='cp1252'):
+    def __init__(self, path_or_buf, encoding='iso-8859-1'):
         super(StataReader, self).__init__(encoding)
         self.col_sizes = ()
         self._has_string_data = False
@@ -841,7 +841,13 @@ def __init__(self, path_or_buf, encoding='cp1252'):
         if isinstance(path_or_buf, (str, compat.text_type, bytes)):
             self.path_or_buf = open(path_or_buf, 'rb')
         else:
-            self.path_or_buf = path_or_buf
+            # Buffer contents in a BytesIO, encoding text to bytes if needed
+            contents = path_or_buf.read()
+            try:
+                contents = contents.encode(self._default_encoding)
+            except:
+                pass
+            self.path_or_buf = BytesIO(contents)
 
         self._read_header()
 
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 6a3c16655745e..f0ebebc1f143f 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -889,7 +889,6 @@ def test_categorical_ordering(self):
             tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
             tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)
 
-
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)