-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Fix handling of encoding for the StataReader #21244 #21246
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,7 +37,8 @@ | |
from pandas.util._decorators import deprecate_kwarg | ||
|
||
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1', | ||
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1') | ||
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1', | ||
'utf-8', 'utf8') | ||
|
||
_version_error = ("Version of given Stata file is not 104, 105, 108, " | ||
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " | ||
|
@@ -1335,7 +1336,7 @@ def _calcsize(self, fmt): | |
|
||
def _decode(self, s): | ||
s = s.partition(b"\0")[0] | ||
return s.decode('utf-8') | ||
return s.decode(self._encoding or self._default_encoding) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should not be changed. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Interesting... this is the line that was causing all the problems with my converter. So |
||
|
||
def _null_terminate(self, s): | ||
if compat.PY3 or self._encoding is not None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -99,9 +99,9 @@ def setup_method(self, method): | |
|
||
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta') | ||
|
||
def read_dta(self, file): | ||
def read_dta(self, file, encoding='latin-1'): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should use |
||
# Legacy default reader configuration | ||
return read_stata(file, convert_dates=True) | ||
return read_stata(file, convert_dates=True, encoding=encoding) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same. |
||
|
||
def read_csv(self, file): | ||
return read_csv(file, parse_dates=True) | ||
|
@@ -268,7 +268,7 @@ def test_read_dta12(self): | |
tm.assert_frame_equal(parsed_117, expected, check_dtype=False) | ||
|
||
def test_read_dta18(self): | ||
parsed_118 = self.read_dta(self.dta22_118) | ||
parsed_118 = self.read_dta(self.dta22_118, encoding='utf-8') | ||
parsed_118["Bytes"] = parsed_118["Bytes"].astype('O') | ||
expected = DataFrame.from_records( | ||
[['Cat', 'Bogota', u'Bogotá', 1, 1.0, u'option b Ünicode', 1.0], | ||
|
@@ -283,7 +283,7 @@ def test_read_dta18(self): | |
for col in parsed_118.columns: | ||
tm.assert_almost_equal(parsed_118[col], expected[col]) | ||
|
||
with StataReader(self.dta22_118) as rdr: | ||
with StataReader(self.dta22_118, encoding='utf-8') as rdr: | ||
vl = rdr.variable_labels() | ||
vl_expected = {u'Unicode_Cities_Strl': | ||
u'Here are some strls with Ünicode chars', | ||
|
@@ -1358,7 +1358,7 @@ def test_invalid_encoding(self): | |
original = self.read_csv(self.csv3) | ||
with pytest.raises(ValueError): | ||
with tm.ensure_clean() as path: | ||
original.to_stata(path, encoding='utf-8') | ||
original.to_stata(path, encoding='pokemon') | ||
|
||
def test_path_pathlib(self): | ||
df = tm.makeDataFrame() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
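These are not valid in general. The set of valid encodings depends on the reader. For format versions < 118 it is
'ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1', 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1';
for 118 it is
'utf-8', 'utf8'.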