Skip to content

Read CSV using c engine silently swallows useful exceptions #13652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
bdrosen96 opened this issue Jul 14, 2016 · 17 comments
Closed

Read CSV using c engine silently swallows useful exceptions #13652

bdrosen96 opened this issue Jul 14, 2016 · 17 comments
Labels
Error Reporting Incorrect or improved errors from pandas IO CSV read_csv, to_csv Unicode Unicode strings
Milestone

Comments

@bdrosen96
Copy link
Contributor

Code Sample, a copy-pastable example if possible

import pandas
import codecs
import traceback
import sys

# Print the installed-versions report, then the pandas and Python versions,
# so the captured output records the environment the repro was run in.
pandas.show_versions()

print("Pandas version: {}\n".format(pandas.__version__))
print("Python version: {}\n".format(sys.version_info))

def build_stream():
    """Open the sample CSV in binary mode and wrap it in a UTF-8 recoder.

    The returned ``codecs.StreamRecoder`` uses the UTF-8 codec on both
    sides, so reading from it decodes the file's raw bytes as UTF-8.
    The file is, judging by its name, Shift-JIS encoded (not verified
    here). The underlying file handle is owned by the returned stream
    and is left open for the caller.
    """
    utf8_info = codecs.lookup("utf-8")
    raw = open("tests/data/sauron.SHIFT_JIS.csv", "rb")
    # stream must be binary UTF8
    return codecs.StreamRecoder(
        raw,
        utf8_info.encode,
        utf8_info.decode,
        utf8_info.streamreader,
        utf8_info.streamwriter,
    )

def test_pandas(use_python):
    """Parse a freshly built stream with ``pandas.read_csv``.

    A truthy ``use_python`` selects ``engine='python'``; anything else
    selects ``engine='c'``. The parsed frame is not returned, so the
    function always yields ``None`` unless ``read_csv`` raises.
    """
    stream = build_stream()
    engine = 'python' if use_python else 'c'
    df = pandas.read_csv(stream, engine=engine)


# Case 1: read the recoding stream directly, without pandas. Any exception
# raised by the read is caught and its traceback is printed to stdout.
print("Showing stream error on read\n")
try:
    stream = build_stream()
    data = stream.read()
except Exception as exc:
    traceback.print_exc(file=sys.stdout)


# Case 2: the same stream parsed by read_csv with engine='python'.
# test_pandas() has no return statement, so `stream` is None on success.
print("Showing stream error on read_csv (python engine)\n")
try:
    stream = test_pandas(True)
except Exception as exc:
    traceback.print_exc(file=sys.stdout)


# Case 3: the same stream parsed by read_csv with engine='c'. This is the
# case the report is about: compare the exception printed here with case 2.
print("Showing missing stream error on read_csv (python c)\n")
try:
    stream = test_pandas(False)
except Exception as exc:
    traceback.print_exc(file=sys.stdout)
...

Actual Output


INSTALLED VERSIONS
------------------
commit: None
python: 3.4.3.final.0
python-bits: 64
OS: Linux
OS-release: 3.13.0-55-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8

pandas: 0.16.1
nose: 1.3.7
Cython: 0.24.0a0
numpy: 1.9.2
scipy: 0.16.0
statsmodels: None
IPython: 4.1.2
sphinx: None
patsy: None
dateutil: 2.4.2
pytz: 2015.4
bottleneck: None
tables: None
numexpr: 2.4.3
matplotlib: None
openpyxl: None
xlrd: 0.9.4
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 0.9999999
httplib2: None
apiclient: None
sqlalchemy: None
pymysql: None
psycopg2: None
Pandas version: 0.16.1

Python version: sys.version_info(major=3, minor=4, micro=3, releaselevel='final', serial=0)

Showing stream error on read

Traceback (most recent call last):
  File "pandas_bug.py", line 34, in <module>
    data = stream.read()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 798, in read
    data = self.reader.read(size)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 497, in read
    newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in position 12: invalid start byte
Showing stream error on read_csv (python engine)

Traceback (most recent call last):
  File "pandas_bug.py", line 41, in <module>
    stream = test_pandas(True)
  File "pandas_bug.py", line 28, in test_pandas
    df = pandas.read_csv(stream, engine=engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 474, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 250, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 566, in __init__
    self._make_engine(self.engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 711, in _make_engine
    self._engine = klass(self.f, **self.options)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 1427, in __init__
    self.columns, self.num_original_columns = self._infer_columns()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 1642, in _infer_columns
    line = self._buffered_line()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 1769, in _buffered_line
    return self._next_line()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 1800, in _next_line
    orig_line = next(self.data)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 820, in __next__
    data = next(self.reader)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 638, in __next__
    line = self.readline()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 551, in readline
    data = self.read(readsize, firstline=True)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 497, in read
    newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in position 0: invalid start byte
Showing missing stream error on read_csv (python c)

Traceback (most recent call last):
  File "pandas_bug.py", line 48, in <module>
    stream = test_pandas(False)
  File "pandas_bug.py", line 28, in test_pandas
    df = pandas.read_csv(stream, engine=engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 474, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 250, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 566, in __init__
    self._make_engine(self.engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 705, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 1072, in __init__
    self._reader = _parser.TextReader(src, **kwds)
  File "pandas/parser.pyx", line 509, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4732)
  File "pandas/parser.pyx", line 635, in pandas.parser.TextReader._get_header (pandas/parser.c:6244)
  File "pandas/parser.pyx", line 831, in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:8275)
  File "pandas/parser.pyx", line 1742, in pandas.parser.raise_parser_error (pandas/parser.c:20691)
pandas.parser.CParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

Expected Behavior

The C engine should behave like the python engine. This should be possible by using PyErr_Occurred.

@jreback
Copy link
Contributor

jreback commented Jul 14, 2016

you have a pretty old version of pandas, current is 0.18.1 and 0.19.0 releasing soon.

and you can simply pass in the encoding argument if you need to.

I am closing, but if you can provide a copy-pastable example that reproduces a non-obvious error on latest, then pls reopen.

@jreback jreback closed this as completed Jul 14, 2016
@jreback jreback added Unicode Unicode strings Error Reporting Incorrect or improved errors from pandas IO CSV read_csv, to_csv labels Jul 14, 2016
@bdrosen96
Copy link
Contributor Author

This should not have been closed.

1 Even though the version I have is old, I think this issue still exists in latest version.

2 The encoding issue was just an example that was easy to produce and should not be dismissed because of the existence of the encoding option. If the file handle was a socket and the connection was reset, it would also raise an exception and there would not be a workaround.

@bdrosen96
Copy link
Contributor Author

I just verified this with same code using newer pandas


INSTALLED VERSIONS
------------------
commit: None
python: 3.4.3.final.0
python-bits: 64
OS: Linux
OS-release: 3.13.0-55-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8

pandas: 0.18.1
nose: 1.3.7
pip: 7.1.0
setuptools: 20.2.2
Cython: 0.24.0a0
numpy: 1.9.2
scipy: 0.16.0
statsmodels: None
xarray: None
IPython: 4.1.2
sphinx: None
patsy: None
dateutil: 2.4.2
pytz: 2015.4
blosc: None
bottleneck: None
tables: None
numexpr: 2.4.3
matplotlib: None
openpyxl: None
xlrd: 0.9.4
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 0.9999999
httplib2: None
apiclient: None
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: None
boto: 2.38.0
pandas_datareader: None
Pandas version: 0.18.1

Python version: sys.version_info(major=3, minor=4, micro=3, releaselevel='final', serial=0)

Showing stream error on read

Traceback (most recent call last):
  File "pandas_bug.py", line 34, in <module>
    data = stream.read()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 798, in read
    data = self.reader.read(size)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 497, in read
    newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in position 12: invalid start byte
Showing stream error on read_csv (python engine)

Traceback (most recent call last):
  File "pandas_bug.py", line 41, in <module>
    stream = test_pandas(True)
  File "pandas_bug.py", line 28, in test_pandas
    df = pandas.read_csv(stream, engine=engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 562, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 315, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 645, in __init__
    self._make_engine(self.engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 805, in _make_engine
    self._engine = klass(self.f, **self.options)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 1608, in __init__
    self.columns, self.num_original_columns = self._infer_columns()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 1823, in _infer_columns
    line = self._buffered_line()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 1975, in _buffered_line
    return self._next_line()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 2006, in _next_line
    orig_line = next(self.data)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 820, in __next__
    data = next(self.reader)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 638, in __next__
    line = self.readline()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 551, in readline
    data = self.read(readsize, firstline=True)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 497, in read
    newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in position 0: invalid start byte
Showing missing stream error on read_csv (python c)

Traceback (most recent call last):
  File "pandas_bug.py", line 48, in <module>
    stream = test_pandas(False)
  File "pandas_bug.py", line 28, in test_pandas
    df = pandas.read_csv(stream, engine=engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 562, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 315, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 645, in __init__
    self._make_engine(self.engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 799, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas/io/parsers.py", line 1213, in __init__
    self._reader = _parser.TextReader(src, **kwds)
  File "pandas/parser.pyx", line 520, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:5129)
  File "pandas/parser.pyx", line 671, in pandas.parser.TextReader._get_header (pandas/parser.c:7259)
  File "pandas/parser.pyx", line 868, in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:9602)
  File "pandas/parser.pyx", line 1865, in pandas.parser.raise_parser_error (pandas/parser.c:23325)
pandas.io.common.CParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

@bdrosen96
Copy link
Contributor Author

I do not have permissions to reopen this issue. Can you do so?

@jreback
Copy link
Contributor

jreback commented Jul 14, 2016

@bdrosen96 then pls show an example which can be copy-pasted. e.g. your file is not there. you need to create it to repro (e.g. write out a test csv file or something), better yet is to use StringIO

@jreback
Copy link
Contributor

jreback commented Jul 14, 2016

and you are testing on 0.18.1, I am pretty sure these are already fixed on master.

@bdrosen96
Copy link
Contributor Author

import pandas
import codecs
import traceback
import sys
import io

# Sample text as a Python str; build_stream() encodes it to Shift-JIS bytes

DATA = '''num, text
1,サウロン(Sauron、アイヌアの創造の時 - 第三紀3019年3月25日)は、J・R・R・トールキンの中つ国を舞台とした小説
『ホビットの冒険』『指輪物語』『シルマリルの物語』の登場人物。
2,『ホビットの冒険』に言及のある「死人うらない師」(映画『ホビットシリーズ』の字幕では「死人遣い(ネクロマンサー)」)とは彼のことである。
3,その続編である『指輪物語』においては「一つの指輪(the One Ring)」の作り主、「冥王(Dark Lord)」、「かの者
(the One)[1]」として登場する。前史にあたる『シルマリルの物語』では、初代の冥王モルゴスの最も力ある側近であった。
4,サウロンは元来、アルダ(地球)の創造を担った天使的種族アイヌアの一員であったが、主メルコールの反逆に加担して堕落し、アルダに害をなす存在となった。
5,「サウロン」とはクウェンヤで「身の毛のよだつもの」という意味であり、シンダリンで同様の意味である名前「ゴルサウア」と呼ばれることもある。
6,これらは、サウロンを恐れ、忌み嫌ったエルフによる名であり、『指輪物語』作中においてアラゴルンは「かれ(サウロン)は自分の本当の名は使わないし、それを字に書いたり口に出したりすることも許さない」と発言している。
7,そのほか、第二紀にエルフに対して自称したとされる名に、「アンナタール(物贈る君)」、「アルタノ(高貴な細工師)」、「アウレンディル(アウレの下僕)」がある。
8,第一紀の頃のサウロンは、自在に変身する能力を持っていた。
9,その能力を使えば見目麗しい立派な外見を装うことや、また巨大な狼や吸血こうもりといった怪物に変じることもでき、エルフから恐れられた。
10,第二紀に一つの指輪を作り上げたサウロンは、他の力の指輪で成される事柄やその所有者を支配できるようになった。
11,また、肉体が滅びても指輪がある限り何度でも蘇ることができた。
12,ただしヌーメノール没落の際に美しい肉体を破壊された後は、二度と美しく変身することはできなくなり、その悪意の
具現のような見るも恐ろしい姿しかとれなくなったという。
13,またしばしば「まぶたのない火に縁取られた目」といった心象表現で捉えられた。
'''

# Print the installed-versions report, then the pandas and Python versions,
# so the captured output records the environment the repro was run in.
pandas.show_versions()

print("Pandas version: {}\n".format(pandas.__version__))
print("Python version: {}\n".format(sys.version_info))

def build_stream():
    """Return ``DATA`` as Shift-JIS bytes wrapped in a UTF-8 recoder.

    ``DATA`` is encoded with the ``shift-jis`` codec and placed in an
    in-memory ``io.BytesIO``. The returned ``codecs.StreamRecoder`` uses
    the UTF-8 codec on both sides, so reading from it decodes those bytes
    as UTF-8.
    """
    raw = io.BytesIO(DATA.encode("shift-jis"))
    utf8_info = codecs.lookup("utf-8")
    # stream must be binary UTF8
    return codecs.StreamRecoder(
        raw,
        utf8_info.encode,
        utf8_info.decode,
        utf8_info.streamreader,
        utf8_info.streamwriter,
    )

def test_pandas(use_python):
    """Parse a freshly built stream with ``pandas.read_csv``.

    A truthy ``use_python`` selects ``engine='python'``; anything else
    selects ``engine='c'``. The parsed frame is not returned, so the
    function always yields ``None`` unless ``read_csv`` raises.
    """
    stream = build_stream()
    engine = 'python' if use_python else 'c'
    df = pandas.read_csv(stream, engine=engine)


# Case 1: read the recoding stream directly, without pandas. Any exception
# raised by the read is caught and its traceback is printed to stdout.
print("Showing stream error on read\n")
try:
    stream = build_stream()
    data = stream.read()
except Exception as exc:
    traceback.print_exc(file=sys.stdout)


# Case 2: the same stream parsed by read_csv with engine='python'.
# test_pandas() has no return statement, so `stream` is None on success.
print("Showing stream error on read_csv (python engine)\n")
try:
    stream = test_pandas(True)
except Exception as exc:
    traceback.print_exc(file=sys.stdout)


# Case 3: the same stream parsed by read_csv with engine='c'. This is the
# case the report is about: compare the exception printed here with case 2.
print("Showing missing stream error on read_csv (python c)\n")
try:
    stream = test_pandas(False)
except Exception as exc:
    traceback.print_exc(file=sys.stdout)

@bdrosen96
Copy link
Contributor Author

I just ran this again on master and got same behavior.

Pandas version: 0.18.1+198.g3f6d4bd

@jreback
Copy link
Contributor

jreback commented Jul 14, 2016

pls show the output from master

@jreback
Copy link
Contributor

jreback commented Jul 14, 2016

again, using a non-decoded stream is really really odd; this is not supported

@jreback
Copy link
Contributor

jreback commented Jul 14, 2016

I'll reopen. If you can provide a PR which 'fixes' this I think it will be easier to look/test.

cc @gfyoung

@jreback jreback reopened this Jul 14, 2016
@bdrosen96
Copy link
Contributor Author


INSTALLED VERSIONS
------------------
commit: None
python: 3.4.3.final.0
python-bits: 64
OS: Linux
OS-release: 3.13.0-55-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.18.1+198.g3f6d4bd
nose: 1.3.7
pip: 7.1.0
setuptools: 20.2.2
Cython: 0.24.0a0
numpy: 1.9.2
scipy: 0.16.0
statsmodels: None
xarray: None
IPython: 4.1.2
sphinx: None
patsy: None
dateutil: 2.4.2
pytz: 2015.4
blosc: None
bottleneck: None
tables: None
numexpr: 2.4.3
matplotlib: None
openpyxl: None
xlrd: 0.9.4
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 0.9999999
httplib2: None
apiclient: None
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: None
boto: 2.38.0
pandas_datareader: None
Pandas version: 0.18.1+198.g3f6d4bd

Python version: sys.version_info(major=3, minor=4, micro=3, releaselevel='final', serial=0)

Showing stream error on read

Traceback (most recent call last):
  File "pandas_bug.py", line 57, in <module>
    data = stream.read()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 798, in read
    data = self.reader.read(size)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 497, in read
    newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in position 12: invalid start byte
Showing stream error on read_csv (python engine)

Traceback (most recent call last):
  File "pandas_bug.py", line 64, in <module>
    stream = test_pandas(True)
  File "pandas_bug.py", line 51, in test_pandas
    df = pandas.read_csv(stream, engine=engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 631, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 384, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 714, in __init__
    self._make_engine(self.engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 898, in _make_engine
    self._engine = klass(self.f, **self.options)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 1746, in __init__
    self.columns, self.num_original_columns = self._infer_columns()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 1988, in _infer_columns
    line = self._buffered_line()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 2140, in _buffered_line
    return self._next_line()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 2171, in _next_line
    orig_line = next(self.data)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 820, in __next__
    data = next(self.reader)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 638, in __next__
    line = self.readline()
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 551, in readline
    data = self.read(readsize, firstline=True)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/codecs.py", line 497, in read
    newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in position 0: invalid start byte
Showing missing stream error on read_csv (python c)

Traceback (most recent call last):
  File "pandas_bug.py", line 71, in <module>
    stream = test_pandas(False)
  File "pandas_bug.py", line 51, in test_pandas
    df = pandas.read_csv(stream, engine=engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 631, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 384, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 714, in __init__
    self._make_engine(self.engine)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 892, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/home/brett/.virtualenvs/datasets-service/lib/python3.4/site-packages/pandas-0.18.1+198.g3f6d4bd-py3.4-linux-x86_64.egg/pandas/io/parsers.py", line 1340, in __init__
    self._reader = _parser.TextReader(src, **kwds)
  File "pandas/parser.pyx", line 527, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:5137)
  File "pandas/parser.pyx", line 701, in pandas.parser.TextReader._get_header (pandas/parser.c:7700)
  File "pandas/parser.pyx", line 898, in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:10058)
  File "pandas/parser.pyx", line 1890, in pandas.parser.raise_parser_error (pandas/parser.c:24033)
pandas.io.common.CParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

@bdrosen96
Copy link
Contributor Author

diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 3928bc8..61a1e03 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -10,7 +10,9 @@ import warnings
 from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE
 from cpython cimport (PyObject, PyBytes_FromString,
                       PyBytes_AsString, PyBytes_Check,
-                      PyUnicode_Check, PyUnicode_AsUTF8String)
+                      PyUnicode_Check, PyUnicode_AsUTF8String,
+                      PyErr_Occurred, PyErr_Fetch)
+from cpython.ref cimport PyObject, Py_XDECREF
 from io.common import CParserError, DtypeWarning, EmptyDataError


@@ -1878,6 +1880,17 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL:


 cdef raise_parser_error(object base, parser_t *parser):
+    cdef:
+        object old_exc
+        PyObject *type, *value, *traceback
+    if PyErr_Occurred():
+        PyErr_Fetch(&type, &value, &traceback);
+        Py_XDECREF(type)
+        Py_XDECREF(traceback)
+        if value != NULL:
+            old_exc = <object> value
+            Py_XDECREF(value)
+            raise old_exc
     message = '%s. C error: ' % base
     if parser.error_msg != NULL:
         if PY3:

@gfyoung
Copy link
Member

gfyoung commented Jul 14, 2016

@bdrosen96 : thanks for pointing this out! You can submit a PR for this and make sure to include a test as well (I would think in common.py if possible but otherwise c_parser_only.py)!

@bdrosen96
Copy link
Contributor Author

I cannot submit a PR without creating a fork first (permissions issue)

@gfyoung
Copy link
Member

gfyoung commented Jul 14, 2016

Of course. It clearly says that in the documentation for contributing.

@bdrosen96
Copy link
Contributor Author

#13693

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Error Reporting Incorrect or improved errors from pandas IO CSV read_csv, to_csv Unicode Unicode strings
Projects
None yet
Development

Successfully merging a pull request may close this issue.

3 participants