Commit a55e848
hdfs: Allow hdfs read/write for files

Now the following will work:

- If hdfs3 is not installed, throws:
  ImportError: The hdfs3 library is required to handle hdfs files
- If hdfs3 is installed but libhdfs3 is not installed, throws:
  ImportError: Can not find the shared library: libhdfs3.so
- If hdfs3 is installed, this works:
  pd.read_csv("hdfs://localhost:9000/tmp/a.csv")
- If hdfs3 is installed and HADOOP_CONF_DIR is set, this works:
  HADOOP_CONF_DIR=/usr/local/Cellar/hadoop/2.7.0/libexec/etc/hadoop/
  pd.read_csv("hdfs:///tmp/a.csv")
1 parent c28b624 commit a55e848
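
In practice, the two addressing modes described above look like this (the host, port, and paths are the examples from the commit message, not defaults baked into pandas):

import pandas as pd

# namenode host and port spelled out in the URL
df = pd.read_csv("hdfs://localhost:9000/tmp/a.csv")

# namenode omitted: HADOOP_CONF_DIR (or other hadoop client configuration)
# must be set in the environment so the namenode can be discovered
df = pd.read_csv("hdfs:///tmp/a.csv")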

7 files changed: +67 -12 lines changed

doc/source/io.rst (+8 -2)

@@ -78,7 +78,7 @@ Basic
 
 filepath_or_buffer : various
   Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`,
-  or :class:`py:py._path.local.LocalPath`), URL (including http, ftp, and S3
+  or :class:`py:py._path.local.LocalPath`), URL (including http, ftp, hdfs, and S3
   locations), or any object with a ``read()`` method (such as an open file or
   :class:`~python:io.StringIO`).
 sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_table`

@@ -1579,6 +1579,12 @@ You can pass in a URL to a CSV file:
    df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item',
                     sep='\t')
 
+Or an hdfs URL:
+
+.. code-block:: python
+
+   df = pd.read_csv('hdfs://<nodenamehost>:<nodenameport>/pandas-test/tips.csv')
+
 S3 URLs are handled as well:
 
 .. code-block:: python

@@ -1849,7 +1855,7 @@ The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or
 is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
 
 - ``filepath_or_buffer`` : a **VALID** JSON string or file handle / StringIO. The string could be
-  a URL. Valid URL schemes include http, ftp, S3, and file. For file URLs, a host
+  a URL. Valid URL schemes include http, ftp, hdfs, S3, and file. For file URLs, a host
   is expected. For instance, a local file could be
   file ://localhost/path/to/table.json
 - ``typ`` : type of object to recover (series or frame), default 'frame'

doc/source/whatsnew/v0.22.0.txt (+1)

@@ -139,6 +139,7 @@ Other Enhancements
 - :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`)
 - :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`)
 - ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`)
+- :func:`read_csv` now supports reading from hdfs URLs such as ``"hdfs:///tmp/data.csv"``, in which case the hadoop configuration is discovered automatically; the namenode can also be given explicitly, as in ``"hdfs://<nodenamehost>:<nodenameport>/tmp/data.csv"``
 
 .. _whatsnew_0220.api_breaking:

pandas/compat/__init__.py (+4 -2)

@@ -44,7 +44,7 @@
 PY36 = (sys.version_info >= (3, 6))
 PYPY = (platform.python_implementation() == 'PyPy')
 
-try:
+try:  # Python 2 imports
     import __builtin__ as builtins
     # not writeable when instantiated with string, doesn't handle unicode well
     from cStringIO import StringIO as cStringIO

@@ -53,12 +53,14 @@
     BytesIO = StringIO
     import cPickle
     import httplib
-except ImportError:
+    from urlparse import urlparse as parse_url
+except ImportError:  # Equivalent Python 3 imports
     import builtins
     from io import StringIO, BytesIO
     cStringIO = StringIO
     import pickle as cPickle
     import http.client as httplib
+    from urllib.parse import urlparse as parse_url
 
 from pandas.compat.chainmap import DeepChainMap

pandas/io/common.py (+23 -2)

@@ -99,6 +99,14 @@ def _is_s3_url(url):
         return False
 
 
+def _is_hdfs_url(url):
+    """Check for an hdfs url"""
+    try:
+        return parse_url(url).scheme == 'hdfs'
+    except:
+        return False
+
+
 def _expand_user(filepath_or_buffer):
     """Return the argument with an initial component of ~ or ~user
     replaced by that user's home directory.

@@ -201,6 +209,12 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                                         encoding=encoding,
                                         compression=compression)
 
+    if _is_hdfs_url(filepath_or_buffer):
+        from pandas.io import hdfs
+        return hdfs.get_filepath_or_buffer(filepath_or_buffer,
+                                           encoding=encoding,
+                                           compression=compression)
+
     if isinstance(filepath_or_buffer, (compat.string_types,
                                        compat.binary_type,
                                        mmap.mmap)):

@@ -314,12 +328,19 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     handles : list of file-like objects
         A list of file-like object that were opened in this function.
     """
+    need_text_wrapping = [BytesIO]
     try:
         from s3fs import S3File
-        need_text_wrapping = (BytesIO, S3File)
+        need_text_wrapping.append(S3File)
+    except ImportError:
+        pass
+    try:
+        from hdfs3 import HDFile
+        need_text_wrapping.append(HDFile)
     except ImportError:
-        need_text_wrapping = (BytesIO,)
+        pass
 
+    need_text_wrapping = tuple(need_text_wrapping)
     handles = list()
     f = path_or_buf
 
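
A quick sketch of what the new `_is_hdfs_url` predicate accepts; the non-hdfs examples here are illustrative additions, not taken from the diff:

from pandas.io.common import _is_hdfs_url

assert _is_hdfs_url("hdfs://localhost:9000/tmp/a.csv")  # hdfs scheme matches
assert not _is_hdfs_url("s3://bucket/key.csv")          # other schemes fall through
assert not _is_hdfs_url("/tmp/a.csv")                   # a bare path has no scheme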

pandas/io/hdfs.py (new file, +22)

@@ -0,0 +1,22 @@
+""" hdfs support for remote file interactivity """
+from pandas import compat
+try:
+    import hdfs3
+except:
+    raise ImportError("The hdfs3 library is required to handle hdfs files")
+
+
+def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
+                           compression=None):
+    parsed_url = compat.parse_url(filepath_or_buffer)
+    if ":" in parsed_url.netloc:
+        host, port = parsed_url.netloc.rsplit(":", 1)
+        try:
+            port = int(port)
+            fs = hdfs3.HDFileSystem(host=host, port=port)
+        except ValueError:
+            fs = hdfs3.HDFileSystem()  # invalid port: fall back to defaults
+    else:
+        fs = hdfs3.HDFileSystem()
+    filepath_or_buffer = fs.open(parsed_url.path)
+    return filepath_or_buffer, None, compression
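
For reference, this is how urlparse (exposed as compat.parse_url by this commit) decomposes the two URL forms the module handles; a standalone illustration, not part of the diff:

from pandas import compat

parsed = compat.parse_url("hdfs://localhost:9000/tmp/a.csv")
assert parsed.netloc == "localhost:9000"  # rsplit on ':' gives host and port
assert parsed.path == "/tmp/a.csv"        # this is what fs.open() receives

# with no namenode in the URL, netloc is empty, so the default
# HDFileSystem() picks up the hadoop configuration (e.g. HADOOP_CONF_DIR)
assert compat.parse_url("hdfs:///tmp/a.csv").netloc == ""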

pandas/io/s3.py (+1 -6)

@@ -6,15 +6,10 @@
 except:
     raise ImportError("The s3fs library is required to handle s3 files")
 
-if compat.PY3:
-    from urllib.parse import urlparse as parse_url
-else:
-    from urlparse import urlparse as parse_url
-
 
 def _strip_schema(url):
     """Returns the url without the s3:// part"""
-    result = parse_url(url)
+    result = compat.parse_url(url)
     return result.netloc + result.path
 

pandas/tests/io/test_hdfs.py (new file, +8)

@@ -0,0 +1,8 @@
+from pandas.io.common import _is_hdfs_url
+
+
+class TestHDFSURL(object):
+
+    def test_is_hdfs_url(self):
+        assert _is_hdfs_url("hdfs://pandas/somethingelse.com")
+        assert not _is_hdfs_url("hdf://pandas/somethingelse.com")
