Skip to content

Commit 2f0ada3

Browse files
committed
FIX: Fix encoding to allow StataReader to read urls
Fix encoding so that StataReader can correctly read URLs. Closes pandas-dev#9231.
1 parent bbddca4 commit 2f0ada3

File tree

3 files changed

+12
-7
lines changed

3 files changed

+12
-7
lines changed

doc/source/whatsnew/v0.16.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ Bug Fixes
106106
- Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
107107

108108

109-
109+
- Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
110110

111111

112112

pandas/io/stata.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
4444
Read value labels and convert columns to Categorical/Factor variables
4545
encoding : string, None or encoding
4646
Encoding used to parse the files. Note that Stata doesn't
47-
support unicode. None defaults to cp1252.
47+
support unicode. None defaults to iso-8859-1.
4848
index : identifier of index column
4949
identifier of column that should be used as index of the DataFrame
5050
convert_missing : boolean, defaults to False
@@ -683,7 +683,7 @@ def get_base_missing_value(cls, dtype):
683683

684684

685685
class StataParser(object):
686-
_default_encoding = 'cp1252'
686+
_default_encoding = 'iso-8859-1'
687687

688688
def __init__(self, encoding):
689689
self._encoding = encoding
@@ -823,10 +823,10 @@ class StataReader(StataParser):
823823
Path to .dta file or object implementing a binary read() function
824824
encoding : string, None or encoding
825825
Encoding used to parse the files. Note that Stata doesn't
826-
support unicode. None defaults to cp1252.
826+
support unicode. None defaults to iso-8859-1.
827827
"""
828828

829-
def __init__(self, path_or_buf, encoding='cp1252'):
829+
def __init__(self, path_or_buf, encoding='iso-8859-1'):
830830
super(StataReader, self).__init__(encoding)
831831
self.col_sizes = ()
832832
self._has_string_data = False
@@ -841,7 +841,13 @@ def __init__(self, path_or_buf, encoding='cp1252'):
841841
if isinstance(path_or_buf, (str, compat.text_type, bytes)):
842842
self.path_or_buf = open(path_or_buf, 'rb')
843843
else:
844-
self.path_or_buf = path_or_buf
844+
# Copy to BytesIO, and ensure the contents are raw bytes rather than decoded text
845+
contents = path_or_buf.read()
846+
try:
847+
contents = contents.encode(self._default_encoding)
848+
except:
849+
pass
850+
self.path_or_buf = BytesIO(contents)
845851

846852
self._read_header()
847853

pandas/io/tests/test_stata.py

-1
Original file line numberDiff line numberDiff line change
@@ -889,7 +889,6 @@ def test_categorical_ordering(self):
889889
tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
890890
tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)
891891

892-
893892
if __name__ == '__main__':
894893
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
895894
exit=False)

0 commit comments

Comments
 (0)