
Commit 21f8326

hdfs: Allow hdfs read/write for files
Now the following will work:

- If hdfs3 is not installed, throws:
    ImportError: The hdfs3 library is required to handle hdfs files
- If hdfs3 is installed but libhdfs3 is not installed, throws:
    ImportError: Can not find the shared library: libhdfs3.so
- If hdfs3 is installed, this works:
    pd.read_csv("hdfs://localhost:9000/tmp/a.csv")
- If hdfs3 is installed and HADOOP_CONF_DIR is set, this works:
    HADOOP_CONF_DIR=/usr/local/Cellar/hadoop/2.7.0/libexec/etc/hadoop/ pd.read_csv("hdfs:///tmp/a.csv")
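For illustration, a minimal sketch of the behavior described above, assuming a namenode at localhost:9000 that serves /tmp/a.csv (the same setup as in the commit message):

    import pandas as pd

    # Explicit namenode host and port in the url:
    df = pd.read_csv("hdfs://localhost:9000/tmp/a.csv")

    # Host and port omitted: hdfs3 falls back to the hadoop configuration,
    # e.g. the one pointed to by HADOOP_CONF_DIR:
    #   export HADOOP_CONF_DIR=/usr/local/Cellar/hadoop/2.7.0/libexec/etc/hadoop/
    df = pd.read_csv("hdfs:///tmp/a.csv")

    # With hdfs3 missing, either call raises:
    #   ImportError: The hdfs3 library is required to handle hdfs files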
1 parent a9074db commit 21f8326

File tree

- doc/source/whatsnew/v0.22.0.txt
- pandas/io/common.py
- pandas/io/hdfs.py
- pandas/tests/io/test_hdfs.py

4 files changed: +58 -2 lines changed

doc/source/whatsnew/v0.22.0.txt

+1
@@ -155,6 +155,7 @@ I/O
 ^^^
 
 - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`)
+- :func:`read_csv` now supports reading from hdfs urls such as "hdfs:///tmp/data.csv"; the hadoop configuration is discovered automatically. The namenode host and port can also be given explicitly, as in "hdfs://namenodehost:namenodeport/tmp/data.csv"
 - Bug in :func:`read_msgpack` when a non-existent file is passed in Python 2 (:issue:`15296`)
 - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
 - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)

pandas/io/common.py

+22 -2

@@ -99,6 +99,14 @@ def _is_s3_url(url):
         return False
 
 
+def _is_hdfs_url(url):
+    """Check for an hdfs url"""
+    try:
+        return parse_url(url).scheme == 'hdfs'
+    except Exception:
+        return False
+
+
 def _expand_user(filepath_or_buffer):
     """Return the argument with an initial component of ~ or ~user
     replaced by that user's home directory.
@@ -201,6 +209,12 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                                          encoding=encoding,
                                          compression=compression)
 
+    if _is_hdfs_url(filepath_or_buffer):
+        from pandas.io import hdfs
+        return hdfs.get_filepath_or_buffer(filepath_or_buffer,
+                                           encoding=encoding,
+                                           compression=compression)
+
     if isinstance(filepath_or_buffer, (compat.string_types,
                                        compat.binary_type,
                                        mmap.mmap)):
@@ -314,11 +328,17 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     handles : list of file-like objects
         A list of file-like object that were opened in this function.
     """
+    need_text_wrapping = [BytesIO]
     try:
         from s3fs import S3File
-        need_text_wrapping = (BytesIO, S3File)
+        need_text_wrapping.append(S3File)
+    except ImportError:
+        pass
+    try:
+        from hdfs3 import HDFile
+        need_text_wrapping.append(HDFile)
     except ImportError:
-        need_text_wrapping = (BytesIO,)
+        pass
 
     handles = list()
     f = path_or_buf
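Taken together, the common.py changes add a scheme check and route hdfs urls to the new pandas.io.hdfs module, mirroring the existing s3 branch. A hedged sketch of what a caller sees (the helper names are from this diff; the url and cluster are hypothetical):

    from pandas.io.common import _is_hdfs_url, get_filepath_or_buffer

    url = "hdfs://namenodehost:9000/tmp/data.csv"
    assert _is_hdfs_url(url)                         # scheme check only, no I/O
    assert not _is_hdfs_url("s3://bucket/data.csv")  # other schemes unaffected

    # With hdfs3 installed and a reachable cluster, this returns an open
    # HDFile plus the encoding and compression passthroughs, just like the
    # s3 branch does for S3File:
    f, encoding, compression = get_filepath_or_buffer(url)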

pandas/io/hdfs.py

+27
@@ -0,0 +1,27 @@
+""" hdfs support for remote file interactivity """
+from pandas import compat
+try:
+    import hdfs3
+except ImportError:
+    raise ImportError("The hdfs3 library is required to handle hdfs files")
+
+if compat.PY3:
+    from urllib.parse import urlparse as parse_url
+else:
+    from urlparse import urlparse as parse_url
+
+
+def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
+                           compression=None):
+    parsed_url = parse_url(filepath_or_buffer)
+    if ":" in parsed_url.netloc:
+        host, port = parsed_url.netloc.rsplit(":", 1)
+        try:
+            fs = hdfs3.HDFileSystem(host=host, port=int(port))
+        except ValueError:
+            # no numeric port in the url; use the discovered hadoop config
+            fs = hdfs3.HDFileSystem()
+    else:
+        fs = hdfs3.HDFileSystem()
+    filepath_or_buffer = fs.open(parsed_url.path)
+    return filepath_or_buffer, None, compression
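The module hands back an open hdfs3 file handle, which the rest of the IO stack treats like any other binary file object. A minimal usage sketch, assuming hdfs3 is installed and the cluster and path from the commit message exist:

    from pandas.io import hdfs

    # netloc is split into host:port; the path goes to HDFileSystem.open()
    f, _, compression = hdfs.get_filepath_or_buffer(
        "hdfs://localhost:9000/tmp/a.csv")
    print(f.read(64))  # first 64 bytes of the remote file, as raw bytes
    f.close()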

pandas/tests/io/test_hdfs.py

+8
@@ -0,0 +1,8 @@
+from pandas.io.common import _is_hdfs_url
+
+
+class TestHDFSURL(object):
+
+    def test_is_hdfs_url(self):
+        assert _is_hdfs_url("hdfs://pandas/somethingelse.com")
+        assert not _is_hdfs_url("hdf://pandas/somethingelse.com")
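The new test covers only the scheme check. A hedged sketch of extra cases that could live in the same file (a hypothetical, parametrized addition):

    import pytest

    from pandas.io.common import _is_hdfs_url


    @pytest.mark.parametrize("url, expected", [
        ("hdfs://localhost:9000/tmp/a.csv", True),   # explicit host and port
        ("hdfs:///tmp/a.csv", True),                 # empty netloc still matches
        ("s3://bucket/key.csv", False),              # other remote schemes
        ("/tmp/a.csv", False),                       # plain local paths
    ])
    def test_is_hdfs_url_variants(url, expected):
        assert _is_hdfs_url(url) is expected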
