Skip to content

Commit 6460364

Browse files
committed
Merge remote-tracking branch 'upstream/master' into gilbertoolimpio-DOCS---Ambiguous-description-in-pandas.DataFrame.to_parquet-documentation-pandas-dev#19662
2 parents 8414a1f + d9551c8 commit 6460364

17 files changed

+192
-117
lines changed

pandas/core/frame.py

+1-11
Original file line numberDiff line numberDiff line change
@@ -3944,17 +3944,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
39443944
new_index, new_columns = this.index, this.columns
39453945

39463946
def _arith_op(left, right):
3947-
if fill_value is not None:
3948-
left_mask = isna(left)
3949-
right_mask = isna(right)
3950-
left = left.copy()
3951-
right = right.copy()
3952-
3953-
# one but not both
3954-
mask = left_mask ^ right_mask
3955-
left[left_mask & mask] = fill_value
3956-
right[right_mask & mask] = fill_value
3957-
3947+
left, right = ops.fill_binop(left, right, fill_value)
39583948
return func(left, right)
39593949

39603950
if this._is_mixed_type or other._is_mixed_type:

pandas/core/ops.py

+75-34
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,79 @@ def _make_flex_doc(op_name, typ):
398398
return doc
399399

400400

401+
# -----------------------------------------------------------------------------
402+
# Masking NA values and fallbacks for operations numpy does not support
403+
404+
def fill_binop(left, right, fill_value):
    """
    If a non-None fill_value is given, replace null entries in left and right
    with this value, but only in positions where _one_ of left/right is null,
    not both.

    Parameters
    ----------
    left : array-like
    right : array-like
    fill_value : object

    Returns
    -------
    left : array-like
    right : array-like

    Notes
    -----
    Makes copies if fill_value is not None
    """
    # TODO: can we make a no-copy implementation?
    if fill_value is not None:
        nulls_left = isna(left)
        nulls_right = isna(right)
        left = left.copy()
        right = right.copy()

        # fill only where exactly one of the two sides is null; positions
        # where both are null are left as NA
        one_sided = nulls_left ^ nulls_right
        left[nulls_left & one_sided] = fill_value
        right[nulls_right & one_sided] = fill_value
    return left, right
437+
438+
439+
def mask_cmp_op(x, y, op, allowed_types):
    """
    Apply the function `op` to only non-null points in x and y.

    Parameters
    ----------
    x : array-like
    y : array-like
    op : binary operation
    allowed_types : class or tuple of classes

    Returns
    -------
    result : ndarray[bool]
    """
    # TODO: Can we make the allowed_types arg unnecessary?
    xflat = x.ravel()
    result = np.empty(x.size, dtype=bool)
    if isinstance(y, allowed_types):
        yflat = y.ravel()
        mask = notna(xflat) & notna(yflat)
        result[mask] = op(np.array(list(xflat[mask])),
                          np.array(list(yflat[mask])))
    else:
        mask = notna(xflat)
        result[mask] = op(np.array(list(xflat[mask])), y)

    # null comparisons are False for every op except `ne`, which is True
    fill = op == operator.ne  # pragma: no cover
    np.putmask(result, ~mask, fill)
    return result.reshape(x.shape)
472+
473+
401474
# -----------------------------------------------------------------------------
402475
# Functions that add arithmetic methods to objects, given arithmetic factory
403476
# methods
@@ -1127,23 +1200,7 @@ def na_op(x, y):
11271200
with np.errstate(invalid='ignore'):
11281201
result = op(x, y)
11291202
except TypeError:
1130-
xrav = x.ravel()
1131-
result = np.empty(x.size, dtype=bool)
1132-
if isinstance(y, (np.ndarray, ABCSeries)):
1133-
yrav = y.ravel()
1134-
mask = notna(xrav) & notna(yrav)
1135-
result[mask] = op(np.array(list(xrav[mask])),
1136-
np.array(list(yrav[mask])))
1137-
else:
1138-
mask = notna(xrav)
1139-
result[mask] = op(np.array(list(xrav[mask])), y)
1140-
1141-
if op == operator.ne: # pragma: no cover
1142-
np.putmask(result, ~mask, True)
1143-
else:
1144-
np.putmask(result, ~mask, False)
1145-
result = result.reshape(x.shape)
1146-
1203+
result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries))
11471204
return result
11481205

11491206
@Appender('Wrapper for flexible comparison methods {name}'
@@ -1221,23 +1278,7 @@ def na_op(x, y):
12211278
try:
12221279
result = expressions.evaluate(op, str_rep, x, y)
12231280
except TypeError:
1224-
xrav = x.ravel()
1225-
result = np.empty(x.size, dtype=bool)
1226-
if isinstance(y, np.ndarray):
1227-
yrav = y.ravel()
1228-
mask = notna(xrav) & notna(yrav)
1229-
result[mask] = op(np.array(list(xrav[mask])),
1230-
np.array(list(yrav[mask])))
1231-
else:
1232-
mask = notna(xrav)
1233-
result[mask] = op(np.array(list(xrav[mask])), y)
1234-
1235-
if op == operator.ne: # pragma: no cover
1236-
np.putmask(result, ~mask, True)
1237-
else:
1238-
np.putmask(result, ~mask, False)
1239-
result = result.reshape(x.shape)
1240-
1281+
result = mask_cmp_op(x, y, op, np.ndarray)
12411282
return result
12421283

12431284
@Appender('Wrapper for comparison method {name}'.format(name=name))

pandas/core/series.py

+2-13
Original file line numberDiff line numberDiff line change
@@ -1725,19 +1725,8 @@ def _binop(self, other, func, level=None, fill_value=None):
17251725
copy=False)
17261726
new_index = this.index
17271727

1728-
this_vals = this.values
1729-
other_vals = other.values
1730-
1731-
if fill_value is not None:
1732-
this_mask = isna(this_vals)
1733-
other_mask = isna(other_vals)
1734-
this_vals = this_vals.copy()
1735-
other_vals = other_vals.copy()
1736-
1737-
# one but not both
1738-
mask = this_mask ^ other_mask
1739-
this_vals[this_mask & mask] = fill_value
1740-
other_vals[other_mask & mask] = fill_value
1728+
this_vals, other_vals = ops.fill_binop(this.values, other.values,
1729+
fill_value)
17411730

17421731
with np.errstate(all='ignore'):
17431732
result = func(this_vals, other_vals)

pandas/io/common.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
183183
184184
Returns
185185
-------
186-
a filepath_ or buffer or S3File instance, the encoding, the compression
186+
tuple of ({a filepath_ or buffer or S3File instance},
187+
encoding, str,
188+
compression, str,
189+
should_close, bool)
187190
"""
188191
filepath_or_buffer = _stringify_path(filepath_or_buffer)
189192

@@ -194,7 +197,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
194197
# Override compression based on Content-Encoding header
195198
compression = 'gzip'
196199
reader = BytesIO(req.read())
197-
return reader, encoding, compression
200+
req.close()
201+
return reader, encoding, compression, True
198202

199203
if is_s3_url(filepath_or_buffer):
200204
from pandas.io import s3
@@ -206,13 +210,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
206210
if isinstance(filepath_or_buffer, (compat.string_types,
207211
compat.binary_type,
208212
mmap.mmap)):
209-
return _expand_user(filepath_or_buffer), None, compression
213+
return _expand_user(filepath_or_buffer), None, compression, False
210214

211215
if not is_file_like(filepath_or_buffer):
212216
msg = "Invalid file path or buffer object type: {_type}"
213217
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
214218

215-
return filepath_or_buffer, None, compression
219+
return filepath_or_buffer, None, compression, False
216220

217221

218222
def file_path_to_url(path):
@@ -309,6 +313,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
309313
is_text : boolean, default True
310314
whether file/buffer is in text format (csv, json, etc.), or in binary
311315
mode (pickle, etc.)
316+
312317
Returns
313318
-------
314319
f : file-like

pandas/io/excel.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ def __init__(self, io, **kwds):
381381
if _is_url(self._io):
382382
io = _urlopen(self._io)
383383
elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
384-
io, _, _ = get_filepath_or_buffer(self._io)
384+
io, _, _, _ = get_filepath_or_buffer(self._io)
385385

386386
if engine == 'xlrd' and isinstance(io, xlrd.Book):
387387
self.book = io

pandas/io/json/json.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
404404
"""
405405

406406
compression = _infer_compression(path_or_buf, compression)
407-
filepath_or_buffer, _, compression = get_filepath_or_buffer(
407+
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
408408
path_or_buf, encoding=encoding, compression=compression,
409409
)
410410

@@ -419,7 +419,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
419419
if chunksize:
420420
return json_reader
421421

422-
return json_reader.read()
422+
result = json_reader.read()
423+
if should_close:
424+
try:
425+
filepath_or_buffer.close()
426+
except: # noqa: flake8
427+
pass
428+
return result
423429

424430

425431
class JsonReader(BaseIterator):

pandas/io/packers.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -180,14 +180,20 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
180180
obj : type of object stored in file
181181
182182
"""
183-
path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
183+
path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
184184
if iterator:
185185
return Iterator(path_or_buf)
186186

187187
def read(fh):
188188
l = list(unpack(fh, encoding=encoding, **kwargs))
189189
if len(l) == 1:
190190
return l[0]
191+
192+
if should_close:
193+
try:
194+
path_or_buf.close()
195+
except: # noqa: flake8
196+
pass
191197
return l
192198

193199
# see if we have an actual file

pandas/io/parquet.py

+19-11
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def write(self, df, path, compression='snappy',
107107
self.validate_dataframe(df)
108108
if self._pyarrow_lt_070:
109109
self._validate_write_lt_070(df)
110-
path, _, _ = get_filepath_or_buffer(path, mode='wb')
110+
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
111111

112112
if self._pyarrow_lt_060:
113113
table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
@@ -121,13 +121,21 @@ def write(self, df, path, compression='snappy',
121121
coerce_timestamps=coerce_timestamps, **kwargs)
122122

123123
def read(self, path, columns=None, **kwargs):
124-
path, _, _ = get_filepath_or_buffer(path)
124+
path, _, _, should_close = get_filepath_or_buffer(path)
125125
if self._pyarrow_lt_070:
126-
return self.api.parquet.read_pandas(path, columns=columns,
127-
**kwargs).to_pandas()
128-
kwargs['use_pandas_metadata'] = True
129-
return self.api.parquet.read_table(path, columns=columns,
130-
**kwargs).to_pandas()
126+
result = self.api.parquet.read_pandas(path, columns=columns,
127+
**kwargs).to_pandas()
128+
else:
129+
kwargs['use_pandas_metadata'] = True
130+
result = self.api.parquet.read_table(path, columns=columns,
131+
**kwargs).to_pandas()
132+
if should_close:
133+
try:
134+
path.close()
135+
except: # noqa: flake8
136+
pass
137+
138+
return result
131139

132140
def _validate_write_lt_070(self, df):
133141
# Compatibility shim for pyarrow < 0.7.0
@@ -199,11 +207,11 @@ def write(self, df, path, compression='snappy', **kwargs):
199207
# path is s3:// so we need to open the s3file in 'wb' mode.
200208
# TODO: Support 'ab'
201209

202-
path, _, _ = get_filepath_or_buffer(path, mode='wb')
210+
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
203211
# And pass the opened s3file to the fastparquet internal impl.
204212
kwargs['open_with'] = lambda path, _: path
205213
else:
206-
path, _, _ = get_filepath_or_buffer(path)
214+
path, _, _, _ = get_filepath_or_buffer(path)
207215

208216
with catch_warnings(record=True):
209217
self.api.write(path, df,
@@ -214,13 +222,13 @@ def read(self, path, columns=None, **kwargs):
214222
# When path is s3:// an S3File is returned.
215223
# We need to retain the original path(str) while also
216224
# pass the S3File().open function to fastparquet impl.
217-
s3, _, _ = get_filepath_or_buffer(path)
225+
s3, _, _, should_close = get_filepath_or_buffer(path)
218226
try:
219227
parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
220228
finally:
221229
s3.close()
222230
else:
223-
path, _, _ = get_filepath_or_buffer(path)
231+
path, _, _, _ = get_filepath_or_buffer(path)
224232
parquet_file = self.api.ParquetFile(path)
225233

226234
return parquet_file.to_pandas(columns=columns, **kwargs)

pandas/io/parsers.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ def _read(filepath_or_buffer, kwds):
413413

414414
compression = kwds.get('compression')
415415
compression = _infer_compression(filepath_or_buffer, compression)
416-
filepath_or_buffer, _, compression = get_filepath_or_buffer(
416+
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
417417
filepath_or_buffer, encoding, compression)
418418
kwds['compression'] = compression
419419

@@ -439,6 +439,13 @@ def _read(filepath_or_buffer, kwds):
439439
data = parser.read(nrows)
440440
finally:
441441
parser.close()
442+
443+
if should_close:
444+
try:
445+
filepath_or_buffer.close()
446+
except: # noqa: flake8
447+
pass
448+
442449
return data
443450

444451

pandas/io/s3.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
2727
fs = s3fs.S3FileSystem(anon=False)
2828
try:
2929
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
30-
except (OSError, NoCredentialsError):
30+
except (compat.FileNotFoundError, NoCredentialsError):
3131
# boto3 has troubles when trying to access a public file
3232
# when credentialed...
3333
# An OSError is raised if you have credentials, but they
@@ -36,4 +36,4 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
3636
# for that bucket.
3737
fs = s3fs.S3FileSystem(anon=True)
3838
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
39-
return filepath_or_buffer, None, compression
39+
return filepath_or_buffer, None, compression, True

pandas/io/sas/sas7bdat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
9090
self._current_row_on_page_index = 0
9191
self._current_row_in_file_index = 0
9292

93-
self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
93+
self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
9494
if isinstance(self._path_or_buf, compat.string_types):
9595
self._path_or_buf = open(self._path_or_buf, 'rb')
9696
self.handle = self._path_or_buf

pandas/io/sas/sas_xport.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,8 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
236236
self._chunksize = chunksize
237237

238238
if isinstance(filepath_or_buffer, str):
239-
filepath_or_buffer, encoding, compression = get_filepath_or_buffer(
239+
(filepath_or_buffer, encoding,
240+
compression, should_close) = get_filepath_or_buffer(
240241
filepath_or_buffer, encoding=encoding)
241242

242243
if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):

0 commit comments

Comments
 (0)