Skip to content

FIX: Fix encoding to allow StataReader to read urls #9245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 16, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ Bug Fixes
- Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).



- Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).



Expand Down
16 changes: 11 additions & 5 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
Read value labels and convert columns to Categorical/Factor variables
encoding : string, None or encoding
Encoding used to parse the files. Note that Stata doesn't
support unicode. None defaults to cp1252.
support unicode. None defaults to iso-8859-1.
index : identifier of index column
identifier of column that should be used as index of the DataFrame
convert_missing : boolean, defaults to False
Expand Down Expand Up @@ -683,7 +683,7 @@ def get_base_missing_value(cls, dtype):


class StataParser(object):
_default_encoding = 'cp1252'
_default_encoding = 'iso-8859-1'

def __init__(self, encoding):
self._encoding = encoding
Expand Down Expand Up @@ -823,10 +823,10 @@ class StataReader(StataParser):
Path to .dta file or object implementing a binary read() function
encoding : string, None or encoding
Encoding used to parse the files. Note that Stata doesn't
support unicode. None defaults to cp1252.
support unicode. None defaults to iso-8859-1.
"""

def __init__(self, path_or_buf, encoding='cp1252'):
def __init__(self, path_or_buf, encoding='iso-8859-1'):
super(StataReader, self).__init__(encoding)
self.col_sizes = ()
self._has_string_data = False
Expand All @@ -841,7 +841,13 @@ def __init__(self, path_or_buf, encoding='cp1252'):
if isinstance(path_or_buf, (str, compat.text_type, bytes)):
self.path_or_buf = open(path_or_buf, 'rb')
else:
self.path_or_buf = path_or_buf
# Copy to BytesIO, and ensure no encoding
contents = path_or_buf.read()
try:
contents = contents.encode(self._default_encoding)
except:
pass
self.path_or_buf = BytesIO(contents)

self._read_header()

Expand Down
1 change: 0 additions & 1 deletion pandas/io/tests/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,6 @@ def test_categorical_ordering(self):
tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)
Expand Down