diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 039b24cc63217..eae60828a9e5e 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -25,32 +25,101 @@ New features - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) + +.. _whatsnew_0210.enhancements.infer_objects: + +``infer_objects`` type conversion +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`DataFrame.infer_objects` and :meth:`Series.infer_objects` +methods have been added to perform dtype inference on object columns, replacing +some of the functionality of the deprecated ``convert_objects`` +method. See the documentation :ref:`here ` +for more details. (:issue:`11221`) + +This method only performs soft conversions on object columns, converting Python objects +to native types, but not any coercive conversions. For example: + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3], + 'B': np.array([1, 2, 3], dtype='object'), + 'C': ['1', '2', '3']}) + df.dtypes + df.infer_objects().dtypes + +Note that column ``'C'`` was not converted - only scalar numeric types +will be inferred to a new type. Other types of conversion should be accomplished +using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). + +.. ipython:: python + + df = df.infer_objects() + df['C'] = pd.to_numeric(df['C'], errors='coerce') + df.dtypes + .. _whatsnew_0210.enhancements.other: Other Enhancements ^^^^^^^^^^^^^^^^^^ - The ``validate`` argument for :func:`merge` function now checks whether a merge is one-to-one, one-to-many, many-to-one, or many-to-many. If a merge is found to not be an example of specified merge type, an exception of type ``MergeError`` will be raised. 
For more, see :ref:`here ` (:issue:`16270`) -- ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) -- ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`) -- ``Series.rename_axis()`` and ``DataFrame.rename_axis()`` with ``inplace=True`` now return ``None`` while renaming the axis inplace. (:issue:`15704`) -- :func:`to_pickle` has gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ +- :func:`Series.to_dict` and :func:`DataFrame.to_dict` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) +- :func:`RangeIndex.append` now returns a ``RangeIndex`` object when possible (:issue:`16212`) +- :func:`Series.rename_axis` and :func:`DataFrame.rename_axis` with ``inplace=True`` now return ``None`` while renaming the axis inplace. (:issue:`15704`) +- :func:`Series.to_pickle` and :func:`DataFrame.to_pickle` have gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ - :func:`api.types.infer_dtype` now infers decimals. (:issue:`15690`) - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) -- :func:`Dataframe.select_dtypes` now accepts scalar values for include/exclude as well as list-like. 
(:issue:`16855`) +- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) +- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`) +- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) + +.. _whatsnew_0210.enhancements.read_csv: + +``read_csv`` allows basic auth and skipping SSL verification +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is now possible to read data (i.e. CSV, JSON, HTML) from a URL that is password-protected (:issue:`16716`). +The :func:`read_csv`, :func:`read_html`, :func:`read_json` and +:func:`read_excel` functions now accept an optional ``auth`` parameter, a tuple +containing the username and password, to perform Basic Authentication over HTTP(S). +A URL containing the username and password in the standard form +``https://<username>:<password>@<host>:<port>/<path>`` also works now. +Use of username and password over HTTP (without SSL) is a security threat and not recommended. + +.. ipython:: python + url = 'http://myhost.com:1010/my.csv' + username = 'darth' + password = 'cand3stroypassword' + df = pd.read_csv(url, auth=(username, password)) + + # or + df = pd.read_csv('https://darth:cand3stroypassword@myhost.com:1010/my.csv') + +It is now also possible to bypass verification of the SSL certificate on HTTPS calls. +The :func:`read_csv`, :func:`read_html`, :func:`read_json` and +:func:`read_excel` functions now accept an optional parameter ``verify_ssl=False`` to bypass SSL verification. +Doing this creates a security risk and is not recommended; it will raise a ``pandas.io.common.InsecureRequestWarning`` warning. +This functionality is added primarily to help in testability scenarios, or within private networks where obtaining SSL certificates is burdensome. + +..
ipython:: python + url = 'https://my-test-server.internal/my.csv' + df = pd.read_csv(url, verify_ssl=False) # succeed with pandas.io.common.InsecureRequestWarning + + df = pd.read_csv(url) # Fails by default .. _whatsnew_0210.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_0210.api_breaking.pandas_eval: + Improved error handling during item assignment in pd.eval ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. _whatsnew_0210.api_breaking.pandas_eval: - :func:`eval` will now raise a ``ValueError`` when item assignment malfunctions, or inplace operations are specified, but there is no item assignment in the expression (:issue:`16732`) @@ -92,9 +161,14 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in ... ValueError: Cannot operate inplace if there is no assignment +.. _whatsnew_0210.api: + +Other API Changes +^^^^^^^^^^^^^^^^^ + - Support has been dropped for Python 3.4 (:issue:`15251`) - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`) -- Accessing a non-existent attribute on a closed :class:`HDFStore` will now +- Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) @@ -102,12 +176,6 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in - Compression defaults in HDF stores now follow pytable standards. 
Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) - Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) - -.. _whatsnew_0210.api: - -Other API Changes -^^^^^^^^^^^^^^^^^ - - Moved definition of ``MergeError`` to the ``pandas.errors`` module. @@ -117,6 +185,7 @@ Deprecations ~~~~~~~~~~~~ - :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). +- ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). .. _whatsnew_0210.prior_deprecations: @@ -124,10 +193,12 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`read_excel()` has dropped the ``has_index_names`` parameter (:issue:`10967`) +- The ``pd.options.display.height`` configuration has been dropped (:issue:`3663`) +- The ``pd.options.display.line_width`` configuration has been dropped (:issue:`2881`) - The ``pd.options.display.mpl_style`` configuration has been dropped (:issue:`12190`) - ``Index`` has dropped the ``.sym_diff()`` method in favor of ``.symmetric_difference()`` (:issue:`12591`) - ``Categorical`` has dropped the ``.order()`` and ``.sort()`` methods in favor of ``.sort_values()`` (:issue:`12882`) -- :func:`eval` and :method:`DataFrame.eval` have changed the default of ``inplace`` from ``None`` to ``False`` (:issue:`11149`) +- :func:`eval` and :func:`DataFrame.eval` have changed the default of ``inplace`` from ``None`` to ``False`` (:issue:`11149`) - The function ``get_offset_name`` has been dropped in favor of the ``.freqstr`` attribute for an offset 
(:issue:`11834`) @@ -136,6 +207,7 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) .. _whatsnew_0210.bug_fixes: @@ -143,7 +215,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Fixes regression in 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) Conversion ^^^^^^^^^^ @@ -155,34 +226,41 @@ Indexing - When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`). - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). - Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`). +- Fixed :func:`TimedeltaIndex.get_loc` handling of ``np.timedelta64`` inputs (:issue:`16909`). +- Fix :func:`MultiIndex.sort_index` ordering when ``ascending`` argument is a list, but not all levels are specified, or are in a different order (:issue:`16934`). 
+- Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) +- Bug in reindexing on an empty ``CategoricalIndex`` (:issue:`16770`) I/O ^^^ - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`) +- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) Plotting ^^^^^^^^ - +- Bug in plotting methods using ``secondary_y`` and ``fontsize`` not setting secondary axis font size (:issue:`12565`) Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) -- Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) -- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) - +- Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) +- Bug in :func:`infer_freq` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) +- Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) +- Bug in ``groupby.transform()`` that would coerce boolean dtypes back to float (:issue:`16875`) Sparse ^^^^^^ - +- Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16777`) Reshaping ^^^^^^^^^ - +- Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`) +- Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`) +- Fixes regression from 0.20, :func:`Series.aggregate` and 
:func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) Numeric @@ -192,9 +270,10 @@ Numeric Categorical ^^^^^^^^^^^ -- Bug in ``:func:Series.isin()`` when called with a categorical (:issue`16639`) +- Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`) Other ^^^^^ - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) +- Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) diff --git a/pandas/io/common.py b/pandas/io/common.py index cbfc33dbebb81..0c06cc209252f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -4,6 +4,9 @@ import csv import codecs import mmap +import ssl +import base64 +import warnings from contextlib import contextmanager, closing from pandas.compat import StringIO, BytesIO, string_types, text_type @@ -49,7 +52,7 @@ if compat.PY3: - from urllib.request import urlopen, pathname2url + from urllib.request import urlopen, pathname2url, Request _urlopen = urlopen from urllib.parse import urlparse as parse_url from urllib.parse import (uses_relative, uses_netloc, uses_params, @@ -58,6 +61,7 @@ from http.client import HTTPException # noqa else: from urllib2 import urlopen as _urlopen + from urllib2 import Request from urllib import urlencode, pathname2url # noqa from urlparse import urlparse as parse_url from urlparse import uses_relative, uses_netloc, uses_params, urljoin @@ -177,7 +181,8 @@ def _stringify_path(filepath_or_buffer): def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None): + compression=None, auth=None, + verify_ssl=None): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. 
@@ -185,9 +190,29 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), - or buffer + or buffer now supports url with username and password + eg: 'https://:@:/' + + .. versionadded:: 0.21.0 + encoding : the encoding to use to decode py3 bytes, default is 'utf-8' + compression : string, default None + + .. versionadded:: 0.18.1 + + auth : tuple, default None + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + + .. versionadded:: 0.21.0 + + verify_ssl : boolean, Default True + If False, allow self signed and invalid SSL certificates for https + + .. versionadded:: 0.21.0 + + Returns ------- a filepath_or_buffer, the encoding, the compression @@ -195,7 +220,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, filepath_or_buffer = _stringify_path(filepath_or_buffer) if _is_url(filepath_or_buffer): - req = _urlopen(filepath_or_buffer) + ureq, kwargs = get_urlopen_args(filepath_or_buffer, + auth=auth, + verify_ssl=verify_ssl) + req = _urlopen(ureq, **kwargs) content_encoding = req.headers.get('Content-Encoding', None) if content_encoding == 'gzip': # Override compression based on Content-Encoding header @@ -244,6 +272,108 @@ def file_path_to_url(path): } +class InsecureRequestWarning(Warning): + "Warned when making an unverified HTTPS request. Borrowed from requests" + pass + + +def split_auth_from_url(url_with_uname): + """ + If a url contains username and password, it is extracted and returned + along with a url that does not contain it. + + Parameters + ---------- + url_with_uname : string + a url that may or may not contain username and password + see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt + //:@:/ + + .. 
versionadded:: 0.21.0 + + Returns + ------- + (username, password), url_no_usrpwd : tuple, string + A tuple with the (username, pwd) pair and the + url without the username or password (if it contained them) + + Raises + ------ + ValueError for empty url + """ + if not url_with_uname: + msg = "Empty url: {_type}" + raise ValueError(msg.format(_type=type(url_with_uname))) + o = parse_url(url_with_uname) + uname = o.username if o.username else '' + pwd = o.password if o.password else '' + url_no_usrpwd = url_with_uname + if uname or pwd: + usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname) + url_no_usrpwd = url_with_uname.replace(usrch, o.hostname) + return (uname, pwd), url_no_usrpwd + + +def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): + """ + Generate args to pass to urlopen - including basic auth and support + for disabling verification of SSL certificates (useful where + self-signed SSL certificates are an acceptable security risk, e.g. testing) + + Parameters + ---------- + url_with_uname : string + a url that may or may not contain username and password + see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt + //<user>:<password>@<host>:<port>/<url-path> + + .. versionadded:: 0.21.0 + + auth : tuple, default None + A tuple of strings with (username, password) for + HTTP(S) basic auth: e.g. auth=('roberto', 'panda$4life') + + .. versionadded:: 0.21.0 + + verify_ssl : boolean, default True + If False, allow self-signed and invalid SSL certificates for https + + .. versionadded:: 0.21.0 + + Returns + ------- + Request, kwargs to pass to urlopen. kwargs may be {} or {'context': obj} + + Raises + ------ + ValueError if only one of username or password is provided. 
+ """ + uname = pwd = None + url_no_usrpwd = url_with_uname + if auth and len(auth) == 2: + uname, pwd = auth + if not uname and not pwd: + (uname, pwd), url_no_usrpwd = split_auth_from_url(url_with_uname) + req = Request(url_no_usrpwd) + if uname and pwd: + upstr = '{}:{}'.format(uname, pwd) + if compat.PY3: + b64str = base64.b64encode(bytes(upstr, 'ascii')).decode('utf-8') + else: + b64str = base64.encodestring(upstr).replace('\n', '') + req.add_header("Authorization", "Basic {}".format(b64str)) + elif uname or pwd: + msg = 'Only username or password provided without the other' + raise ValueError(msg) + kwargs = {} + if verify_ssl is False and url_no_usrpwd.lower().startswith('https://'): + kwargs['context'] = ssl._create_unverified_context() + msg = 'SSL certificate verification is being disabled for HTTPS.' + \ + ' Possible security risk' + warnings.warn(msg, InsecureRequestWarning) + return req, kwargs + + def _infer_compression(filepath_or_buffer, compression): """ Get the compression method for filepath_or_buffer. 
If compression='infer', diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 5db4603c37be0..62a8cc670bd60 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -20,7 +20,7 @@ from pandas.errors import EmptyDataError from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, get_filepath_or_buffer, _NA_VALUES, - _stringify_path) + _stringify_path, get_urlopen_args) from pandas.core.indexes.period import Period import pandas._libs.json as json from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, @@ -200,7 +200,6 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, convert_float=True, converters=None, dtype=None, true_values=None, false_values=None, engine=None, squeeze=False, **kwds): - # Can't use _deprecate_kwarg since sheetname=None has a special meaning if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: warnings.warn("The `sheetname` keyword is deprecated, use " @@ -211,7 +210,10 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, "Use just `sheet_name`") if not isinstance(io, ExcelFile): - io = ExcelFile(io, engine=engine) + io = ExcelFile(io, + engine=engine, + auth=kwds.get('auth', None), + verify_ssl=kwds.get('verify_ssl', None)) return io._parse_excel( sheetname=sheet_name, header=header, skiprows=skiprows, names=names, @@ -259,7 +261,11 @@ def __init__(self, io, **kwds): # If io is a url, want to keep the data as bytes so can't pass # to get_filepath_or_buffer() if _is_url(self._io): - io = _urlopen(self._io) + verify_ssl = kwds.get('verify_ssl', None) + ureq, kwargs = get_urlopen_args(self._io, + auth=kwds.get('auth', None), + verify_ssl=verify_ssl) + io = _urlopen(ureq, **kwargs) elif not isinstance(self.io, (ExcelFile, xlrd.Book)): io, _, _ = get_filepath_or_buffer(self._io) diff --git a/pandas/io/html.py b/pandas/io/html.py index 2613f26ae5f52..62bca65c70427 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -15,7 +15,8 @@ from 
pandas.core.dtypes.common import is_list_like from pandas.errors import EmptyDataError from pandas.io.common import (_is_url, urlopen, - parse_url, _validate_header_arg) + parse_url, _validate_header_arg, + get_urlopen_args) from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) @@ -116,19 +117,29 @@ def _get_skiprows(skiprows): type(skiprows).__name__) -def _read(obj): +def _read(obj, auth=None, verify_ssl=None): """Try to read from a url, file or string. Parameters ---------- obj : str, unicode, or file-like + auth : tuple, default None + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + .. versionadded:: 0.21.0 + + verify_ssl : boolean, Default True + If False, allow self signed and invalid SSL certificates for https + + .. versionadded:: 0.21.0 Returns ------- raw_text : str """ if _is_url(obj): - with urlopen(obj) as url: + ureq, kwargs = get_urlopen_args(obj, auth, verify_ssl) + with urlopen(ureq, **kwargs) as url: text = url.read() elif hasattr(obj, 'read'): text = obj.read() @@ -187,11 +198,14 @@ class _HtmlFrameParser(object): functionality. 
""" - def __init__(self, io, match, attrs, encoding): + def __init__(self, io, match, attrs, encoding, auth=None, + verify_ssl=None): self.io = io self.match = match self.attrs = attrs self.encoding = encoding + self.auth = auth + self.verify_ssl = verify_ssl def parse_tables(self): tables = self._parse_tables(self._build_doc(), self.match, self.attrs) @@ -444,7 +458,7 @@ def _parse_tables(self, doc, match, attrs): return result def _setup_build_doc(self): - raw_text = _read(self.io) + raw_text = _read(self.io, self.auth, self.verify_ssl) if not raw_text: raise ValueError('No text parsed from document: %s' % self.io) return raw_text @@ -731,8 +745,11 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs): retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding) - + p = parser(io, compiled_match, + attrs, + encoding, + auth=kwargs.get('auth', None), + verify_ssl=kwargs.get('verify_ssl', None)) try: tables = p.parse_tables() except Exception as caught: @@ -755,7 +772,8 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, - keep_default_na=True): + keep_default_na=True, auth=None, + verify_ssl=False): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -856,7 +874,18 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 - Returns + auth : tuple, default None + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + + .. versionadded:: 0.21.0 + + verify_ssl : boolean, Default True + If False, allow self signed and invalid SSL certificates for https + + .. 
versionadded:: 0.21.0 + + Returns ------- dfs : list of DataFrames @@ -903,4 +932,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na) + keep_default_na=keep_default_na, auth=auth, + verify_ssl=verify_ssl) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 31907ad586817..b403eb45ae2cd 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -174,7 +174,7 @@ def write(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False): + lines=False, auth=None, verify_ssl=None): """ Convert a JSON string to pandas object @@ -263,6 +263,17 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 + auth : tuple, default None + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + + .. versionadded:: 0.21.0 + + verify_ssl : boolean, Default True + If False, allow self signed and invalid SSL certificates for https + + .. versionadded:: 0.21.0 + Returns ------- result : Series or DataFrame, depending on the value of `typ`. 
@@ -321,7 +332,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, """ filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, - encoding=encoding) + encoding=encoding, + auth=auth, + verify_ssl=verify_ssl) if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 343bc7a74fde8..5a85a2bcefa61 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -391,9 +391,12 @@ def _read(filepath_or_buffer, kwds): kwds['encoding'] = encoding compression = kwds.get('compression') + auth = kwds.get('auth', None) + verify_ssl = kwds.get('verify_ssl', None) compression = _infer_compression(filepath_or_buffer, compression) filepath_or_buffer, _, compression = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression) + filepath_or_buffer, encoding, compression, auth, + verify_ssl) kwds['compression'] = compression if kwds.get('date_parser', None) is not None: @@ -574,7 +577,11 @@ def parser_f(filepath_or_buffer, low_memory=_c_parser_defaults['low_memory'], buffer_lines=None, memory_map=False, - float_precision=None): + float_precision=None, + + auth=None, + + verify_ssl=None): # Alias sep -> delimiter. 
if delimiter is None: @@ -654,7 +661,11 @@ def parser_f(filepath_or_buffer, mangle_dupe_cols=mangle_dupe_cols, tupleize_cols=tupleize_cols, infer_datetime_format=infer_datetime_format, - skip_blank_lines=skip_blank_lines) + skip_blank_lines=skip_blank_lines, + + auth=auth, + verify_ssl=verify_ssl + ) return _read(filepath_or_buffer, kwds) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b527e3c5dc254..82fc4d6271f00 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -190,6 +190,24 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('url, uname, pwd, nurl', [ + ('https://a1:b1@cc.com:101/f.csv', + 'a1', + 'b1', + 'https://cc.com:101/f.csv' + ), + ('https://ccc.com:1010/aaa.txt', + '', + '', + 'https://ccc.com:1010/aaa.txt' + ), + ]) + def test_split_url_extract_uname_pwd(self, url, uname, pwd, nurl): + (un, pw), ur = common.split_auth_from_url(url) + assert ur == nurl + assert un == uname + assert pw == pwd + class TestMMapWrapper(object): diff --git a/pandas/tests/io/test_http_auth.py b/pandas/tests/io/test_http_auth.py new file mode 100644 index 0000000000000..26c9e0cf63ce5 --- /dev/null +++ b/pandas/tests/io/test_http_auth.py @@ -0,0 +1,152 @@ +import pytest +from pandas.io.common import InsecureRequestWarning +import pandas as pd +import pandas.util.testing as tm +try: + from urllib2 import HTTPError +except: + from urllib.error import HTTPError + +uname = 'pandasusr' +pwd = 'pandaspwd' +no_auth_path = 'no_auth/' +basic_auth_path = 'basic_auth/' +valid_ssl_url = 'handsome-equator.000webhostapp.com' +invalid_ssl_url = 'pandas-unittest.site11.com' + + +def gen_http_auth_ssl_test_cases(uname, + pwd, + is_auth, + sub_path): + """ + Generate list of test case to test for : http/https, username/pwd in url + or as parameters, self signed ssl certs or trusted ssl certs, no auth + or basic auth + """ + def gen_level1_tc(): + test_cases = [] + # The 
following host doesn't seem to handle urllib but handles + # python requests package. This is because: + # 'urlopen' sets header 'Host' : ':' - acceptable RFC7230 + # 'requests' sets header 'Host' : '' + # so pandas fails on following hosting server (uses some 'apex' server) + # but pandas succeeds on nginx even if port is non-default. + for host, verify_ssl in [(invalid_ssl_url, False), + (valid_ssl_url, True) + ]: + pre_ports = [('http', ''), + ('https', '')] + for pre, port in pre_ports: + test_cases.append( + [host, verify_ssl, pre, port, sub_path, is_auth]) + return test_cases + + def gen_base_url(pre, auth_prefix, host, port, su_pa): + return '{}://{}{}{}/{}'.format(pre, auth_prefix, host, port, su_pa) + tc2 = [] + for host, verify_ssl, pre, port, sp, is_auth in gen_level1_tc(): + u = uname if is_auth else None + p = pwd if is_auth else None + u_no_uname = gen_base_url(pre, '', host, port, sp) + u_with_uname = None + if is_auth: + auth_prefix = '{}:{}@'.format(u, p) if is_auth else '' + u_with_uname = gen_base_url(pre, auth_prefix, host, port, sp) + tc2.append([u_no_uname, u, p, verify_ssl]) + if u_with_uname and u_with_uname != u_no_uname: + tc2.append([u_with_uname, None, None, verify_ssl]) + else: + tc2.append([u_no_uname, None, None, verify_ssl]) + return tc2 + + +valid_no_auth = gen_http_auth_ssl_test_cases(uname='', + pwd='', + is_auth=False, + sub_path=no_auth_path) + +valid_auth = gen_http_auth_ssl_test_cases(uname=uname, + pwd=pwd, + is_auth=True, + sub_path=basic_auth_path) + + +@pytest.mark.slow +@pytest.mark.parametrize('url, uname, pwd, verify_ssl', + valid_no_auth + valid_auth) +def test_http_valid_auth(url, uname, pwd, verify_ssl): + check_http_auth(url, uname, pwd, verify_ssl) + + +wrong_auth = gen_http_auth_ssl_test_cases(uname='fakepwd', + pwd='fakepwd', + is_auth=True, + sub_path=basic_auth_path) + + +@pytest.mark.slow +@pytest.mark.parametrize('url, uname, pwd, verify_ssl', + wrong_auth) +def test_http_invalid_auth(url, uname, pwd, 
verify_ssl): + with pytest.raises(HTTPError): + check_http_auth(url, uname, pwd, verify_ssl) + + +blank_uname = gen_http_auth_ssl_test_cases(uname='', + pwd='fakepwd', + is_auth=True, + sub_path=basic_auth_path) + +blank_pwd = gen_http_auth_ssl_test_cases(uname='fakepwd', + pwd='', + is_auth=True, + sub_path=basic_auth_path) + + +@pytest.mark.slow +@pytest.mark.parametrize('url, uname, pwd, verify_ssl', + blank_uname + blank_pwd) +def test_http_require_uname_and_pwd(url, uname, pwd, verify_ssl): + with pytest.raises(ValueError): + check_http_auth(url, uname, pwd, verify_ssl) + + +@tm.network +def check_http_auth(url, uname, pwd, verify_ssl): + + def get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname): + furl = url + fname + kwargs = {} + if uname or pwd: + kwargs['auth'] = (uname, pwd) + if verify_ssl is not None: + kwargs['verify_ssl'] = verify_ssl + msg = '{0: <90} -- auth:[{1: <10}/{2: <10}] v:[{3: <5}]'.format( + furl, str(uname), str(pwd), str(verify_ssl)) + tcsv = 'animal,bird\ndog,pigeon\ncat,emu\n' + j = '{"animal":{"0":"dog","1":"cat"},"bird":{"0":"pigeon","1":"emu"}}' + if verify_ssl or furl.lower().startswith('http://'): + df = pd_read_fn(furl, **kwargs) + else: + with tm.assert_produces_warning(InsecureRequestWarning): + df = pd_read_fn(furl, **kwargs) + if type(df) is list: # html + df = df[0] + smatch = str(df.to_csv(index=False)) == tcsv + jmatch = str(df.to_json()) == j + res = 'Json : {} -- String: {}'.format(jmatch, smatch) + if not jmatch or not smatch: + raise Exception(' ** ERROR:' + res) + else: + res += ' OK' + print(msg + ' ' + res) + return True + + for pd_read_fn, fname in [(pd.read_csv, 'aaa.csv'), + (pd.read_json, 'jdoc.json'), + (pd.read_excel, 'ex_doc.xlsx'), + (pd.read_html, 'html_file.html') + ]: + assert get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname) + return