From 2f0ada3d430f0cc49fa9ab1e7e2e2a7ec23c9616 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 13 Jan 2015 22:43:37 -0500 Subject: [PATCH] FIX: Fix encoding to allow StataReader to read urls Fix encoding so that StataReader can correctly read URLs closes #9231 --- doc/source/whatsnew/v0.16.0.txt | 2 +- pandas/io/stata.py | 16 +++++++++++----- pandas/io/tests/test_stata.py | 1 - 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 1528747891c64..b221a7df373a4 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -106,7 +106,7 @@ Bug Fixes - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`). - +- Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`). diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d8ebb8027c4ce..ccfe8468813c7 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -44,7 +44,7 @@ def read_stata(filepath_or_buffer, convert_dates=True, Read value labels and convert columns to Categorical/Factor variables encoding : string, None or encoding Encoding used to parse the files. Note that Stata doesn't - support unicode. None defaults to cp1252. + support unicode. None defaults to iso-8859-1. index : identifier of index column identifier of column that should be used as index of the DataFrame convert_missing : boolean, defaults to False @@ -683,7 +683,7 @@ def get_base_missing_value(cls, dtype): class StataParser(object): - _default_encoding = 'cp1252' + _default_encoding = 'iso-8859-1' def __init__(self, encoding): self._encoding = encoding @@ -823,10 +823,10 @@ class StataReader(StataParser): Path to .dta file or object implementing a binary read() functions encoding : string, None or encoding Encoding used to parse the files. Note that Stata doesn't - support unicode. None defaults to cp1252. + support unicode. None defaults to iso-8859-1. """ - def __init__(self, path_or_buf, encoding='cp1252'): + def __init__(self, path_or_buf, encoding='iso-8859-1'): super(StataReader, self).__init__(encoding) self.col_sizes = () self._has_string_data = False @@ -841,7 +841,13 @@ def __init__(self, path_or_buf, encoding='cp1252'): if isinstance(path_or_buf, (str, compat.text_type, bytes)): self.path_or_buf = open(path_or_buf, 'rb') else: - self.path_or_buf = path_or_buf + # Copy to BytesIO, and ensure no encoding + contents = path_or_buf.read() + try: + contents = contents.encode(self._default_encoding) + except: + pass + self.path_or_buf = BytesIO(contents) self._read_header() diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 6a3c16655745e..f0ebebc1f143f 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -889,7 +889,6 @@ def test_categorical_ordering(self): tm.assert_equal(False, parsed_115_unordered[col].cat.ordered) tm.assert_equal(False, parsed_117_unordered[col].cat.ordered) - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)