Skip to content

Commit 6460364

Browse files
committed
Merge remote-tracking branch 'upstream/master' into gilbertoolimpio-DOCS---Ambiguous-description-in-pandas.DataFrame.to_parquet-documentation-pandas-dev#19662
2 parents 8414a1f + d9551c8 commit 6460364

17 files changed

+192
-117
lines changed

pandas/core/frame.py

+1-11
Original file line numberDiff line numberDiff line change
@@ -3944,17 +3944,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
39443944
new_index, new_columns = this.index, this.columns
39453945

39463946
def _arith_op(left, right):
3947-
if fill_value is not None:
3948-
left_mask = isna(left)
3949-
right_mask = isna(right)
3950-
left = left.copy()
3951-
right = right.copy()
3952-
3953-
# one but not both
3954-
mask = left_mask ^ right_mask
3955-
left[left_mask & mask] = fill_value
3956-
right[right_mask & mask] = fill_value
3957-
3947+
left, right = ops.fill_binop(left, right, fill_value)
39583948
return func(left, right)
39593949

39603950
if this._is_mixed_type or other._is_mixed_type:

pandas/core/ops.py

+75-34
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,79 @@ def _make_flex_doc(op_name, typ):
398398
return doc
399399

400400

401+
# -----------------------------------------------------------------------------
402+
# Masking NA values and fallbacks for operations numpy does not support
403+
404+
def fill_binop(left, right, fill_value):
    """
    If a non-None fill_value is given, replace null entries in left and right
    with this value, but only in positions where _one_ of left/right is null,
    not both.

    Parameters
    ----------
    left : array-like
    right : array-like
    fill_value : object

    Returns
    -------
    left : array-like
    right : array-like

    Notes
    -----
    Makes copies if fill_value is not None
    """
    # TODO: can we make a no-copy implementation?
    if fill_value is not None:
        nulls_left = isna(left)
        nulls_right = isna(right)
        left = left.copy()
        right = right.copy()

        # fill only where exactly one of the two sides is null; positions
        # where both are null are left as NA
        one_sided = nulls_left ^ nulls_right
        left[nulls_left & one_sided] = fill_value
        right[nulls_right & one_sided] = fill_value
    return left, right
437+
438+
439+
def mask_cmp_op(x, y, op, allowed_types):
    """
    Apply the function `op` to only non-null points in x and y.

    Parameters
    ----------
    x : array-like
    y : array-like
    op : binary operation
    allowed_types : class or tuple of classes

    Returns
    -------
    result : ndarray[bool]
    """
    # TODO: Can we make the allowed_types arg unnecessary?
    xflat = x.ravel()
    result = np.empty(x.size, dtype=bool)
    if isinstance(y, allowed_types):
        yflat = y.ravel()
        mask = notna(xflat) & notna(yflat)
        result[mask] = op(np.array(list(xflat[mask])),
                          np.array(list(yflat[mask])))
    else:
        mask = notna(xflat)
        result[mask] = op(np.array(list(xflat[mask])), y)

    # null comparisons are False for every op except `ne`, which is True
    fill = op == operator.ne  # pragma: no cover
    np.putmask(result, ~mask, fill)
    return result.reshape(x.shape)
472+
473+
401474
# -----------------------------------------------------------------------------
402475
# Functions that add arithmetic methods to objects, given arithmetic factory
403476
# methods
@@ -1127,23 +1200,7 @@ def na_op(x, y):
11271200
with np.errstate(invalid='ignore'):
11281201
result = op(x, y)
11291202
except TypeError:
1130-
xrav = x.ravel()
1131-
result = np.empty(x.size, dtype=bool)
1132-
if isinstance(y, (np.ndarray, ABCSeries)):
1133-
yrav = y.ravel()
1134-
mask = notna(xrav) & notna(yrav)
1135-
result[mask] = op(np.array(list(xrav[mask])),
1136-
np.array(list(yrav[mask])))
1137-
else:
1138-
mask = notna(xrav)
1139-
result[mask] = op(np.array(list(xrav[mask])), y)
1140-
1141-
if op == operator.ne: # pragma: no cover
1142-
np.putmask(result, ~mask, True)
1143-
else:
1144-
np.putmask(result, ~mask, False)
1145-
result = result.reshape(x.shape)
1146-
1203+
result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries))
11471204
return result
11481205

11491206
@Appender('Wrapper for flexible comparison methods {name}'
@@ -1221,23 +1278,7 @@ def na_op(x, y):
12211278
try:
12221279
result = expressions.evaluate(op, str_rep, x, y)
12231280
except TypeError:
1224-
xrav = x.ravel()
1225-
result = np.empty(x.size, dtype=bool)
1226-
if isinstance(y, np.ndarray):
1227-
yrav = y.ravel()
1228-
mask = notna(xrav) & notna(yrav)
1229-
result[mask] = op(np.array(list(xrav[mask])),
1230-
np.array(list(yrav[mask])))
1231-
else:
1232-
mask = notna(xrav)
1233-
result[mask] = op(np.array(list(xrav[mask])), y)
1234-
1235-
if op == operator.ne: # pragma: no cover
1236-
np.putmask(result, ~mask, True)
1237-
else:
1238-
np.putmask(result, ~mask, False)
1239-
result = result.reshape(x.shape)
1240-
1281+
result = mask_cmp_op(x, y, op, np.ndarray)
12411282
return result
12421283

12431284
@Appender('Wrapper for comparison method {name}'.format(name=name))

pandas/core/series.py

+2-13
Original file line numberDiff line numberDiff line change
@@ -1725,19 +1725,8 @@ def _binop(self, other, func, level=None, fill_value=None):
17251725
copy=False)
17261726
new_index = this.index
17271727

1728-
this_vals = this.values
1729-
other_vals = other.values
1730-
1731-
if fill_value is not None:
1732-
this_mask = isna(this_vals)
1733-
other_mask = isna(other_vals)
1734-
this_vals = this_vals.copy()
1735-
other_vals = other_vals.copy()
1736-
1737-
# one but not both
1738-
mask = this_mask ^ other_mask
1739-
this_vals[this_mask & mask] = fill_value
1740-
other_vals[other_mask & mask] = fill_value
1728+
this_vals, other_vals = ops.fill_binop(this.values, other.values,
1729+
fill_value)
17411730

17421731
with np.errstate(all='ignore'):
17431732
result = func(this_vals, other_vals)

pandas/io/common.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
183183
184184
Returns
185185
-------
186-
a filepath_ or buffer or S3File instance, the encoding, the compression
186+
tuple of ({a filepath_ or buffer or S3File instance},
187+
encoding, str,
188+
compression, str,
189+
should_close, bool)
187190
"""
188191
filepath_or_buffer = _stringify_path(filepath_or_buffer)
189192

@@ -194,7 +197,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
194197
# Override compression based on Content-Encoding header
195198
compression = 'gzip'
196199
reader = BytesIO(req.read())
197-
return reader, encoding, compression
200+
req.close()
201+
return reader, encoding, compression, True
198202

199203
if is_s3_url(filepath_or_buffer):
200204
from pandas.io import s3
@@ -206,13 +210,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
206210
if isinstance(filepath_or_buffer, (compat.string_types,
207211
compat.binary_type,
208212
mmap.mmap)):
209-
return _expand_user(filepath_or_buffer), None, compression
213+
return _expand_user(filepath_or_buffer), None, compression, False
210214

211215
if not is_file_like(filepath_or_buffer):
212216
msg = "Invalid file path or buffer object type: {_type}"
213217
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
214218

215-
return filepath_or_buffer, None, compression
219+
return filepath_or_buffer, None, compression, False
216220

217221

218222
def file_path_to_url(path):
@@ -309,6 +313,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
309313
is_text : boolean, default True
310314
whether file/buffer is in text format (csv, json, etc.), or in binary
311315
mode (pickle, etc.)
316+
312317
Returns
313318
-------
314319
f : file-like

pandas/io/excel.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ def __init__(self, io, **kwds):
381381
if _is_url(self._io):
382382
io = _urlopen(self._io)
383383
elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
384-
io, _, _ = get_filepath_or_buffer(self._io)
384+
io, _, _, _ = get_filepath_or_buffer(self._io)
385385

386386
if engine == 'xlrd' and isinstance(io, xlrd.Book):
387387
self.book = io

pandas/io/json/json.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
404404
"""
405405

406406
compression = _infer_compression(path_or_buf, compression)
407-
filepath_or_buffer, _, compression = get_filepath_or_buffer(
407+
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
408408
path_or_buf, encoding=encoding, compression=compression,
409409
)
410410

@@ -419,7 +419,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
419419
if chunksize:
420420
return json_reader
421421

422-
return json_reader.read()
422+
result = json_reader.read()
423+
if should_close:
424+
try:
425+
filepath_or_buffer.close()
426+
except: # noqa: flake8
427+
pass
428+
return result
423429

424430

425431
class JsonReader(BaseIterator):

pandas/io/packers.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -180,14 +180,20 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
180180
obj : type of object stored in file
181181
182182
"""
183-
path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
183+
path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
184184
if iterator:
185185
return Iterator(path_or_buf)
186186

187187
def read(fh):
188188
l = list(unpack(fh, encoding=encoding, **kwargs))
189189
if len(l) == 1:
190190
return l[0]
191+
192+
if should_close:
193+
try:
194+
path_or_buf.close()
195+
except: # noqa: flake8
196+
pass
191197
return l
192198

193199
# see if we have an actual file

pandas/io/parquet.py

+19-11
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def write(self, df, path, compression='snappy',
107107
self.validate_dataframe(df)
108108
if self._pyarrow_lt_070:
109109
self._validate_write_lt_070(df)
110-
path, _, _ = get_filepath_or_buffer(path, mode='wb')
110+
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
111111

112112
if self._pyarrow_lt_060:
113113
table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
@@ -121,13 +121,21 @@ def write(self, df, path, compression='snappy',
121121
coerce_timestamps=coerce_timestamps, **kwargs)
122122

123123
def read(self, path, columns=None, **kwargs):
124-
path, _, _ = get_filepath_or_buffer(path)
124+
path, _, _, should_close = get_filepath_or_buffer(path)
125125
if self._pyarrow_lt_070:
126-
return self.api.parquet.read_pandas(path, columns=columns,
127-
**kwargs).to_pandas()
128-
kwargs['use_pandas_metadata'] = True
129-
return self.api.parquet.read_table(path, columns=columns,
130-
**kwargs).to_pandas()
126+
result = self.api.parquet.read_pandas(path, columns=columns,
127+
**kwargs).to_pandas()
128+
else:
129+
kwargs['use_pandas_metadata'] = True
130+
result = self.api.parquet.read_table(path, columns=columns,
131+
**kwargs).to_pandas()
132+
if should_close:
133+
try:
134+
path.close()
135+
except: # noqa: flake8
136+
pass
137+
138+
return result
131139

132140
def _validate_write_lt_070(self, df):
133141
# Compatibility shim for pyarrow < 0.7.0
@@ -199,11 +207,11 @@ def write(self, df, path, compression='snappy', **kwargs):
199207
# path is s3:// so we need to open the s3file in 'wb' mode.
200208
# TODO: Support 'ab'
201209

202-
path, _, _ = get_filepath_or_buffer(path, mode='wb')
210+
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
203211
# And pass the opened s3file to the fastparquet internal impl.
204212
kwargs['open_with'] = lambda path, _: path
205213
else:
206-
path, _, _ = get_filepath_or_buffer(path)
214+
path, _, _, _ = get_filepath_or_buffer(path)
207215

208216
with catch_warnings(record=True):
209217
self.api.write(path, df,
@@ -214,13 +222,13 @@ def read(self, path, columns=None, **kwargs):
214222
# When path is s3:// an S3File is returned.
215223
# We need to retain the original path(str) while also
216224
# pass the S3File().open function to fastparquet impl.
217-
s3, _, _ = get_filepath_or_buffer(path)
225+
s3, _, _, should_close = get_filepath_or_buffer(path)
218226
try:
219227
parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
220228
finally:
221229
s3.close()
222230
else:
223-
path, _, _ = get_filepath_or_buffer(path)
231+
path, _, _, _ = get_filepath_or_buffer(path)
224232
parquet_file = self.api.ParquetFile(path)
225233

226234
return parquet_file.to_pandas(columns=columns, **kwargs)

pandas/io/parsers.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ def _read(filepath_or_buffer, kwds):
413413

414414
compression = kwds.get('compression')
415415
compression = _infer_compression(filepath_or_buffer, compression)
416-
filepath_or_buffer, _, compression = get_filepath_or_buffer(
416+
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
417417
filepath_or_buffer, encoding, compression)
418418
kwds['compression'] = compression
419419

@@ -439,6 +439,13 @@ def _read(filepath_or_buffer, kwds):
439439
data = parser.read(nrows)
440440
finally:
441441
parser.close()
442+
443+
if should_close:
444+
try:
445+
filepath_or_buffer.close()
446+
except: # noqa: flake8
447+
pass
448+
442449
return data
443450

444451

pandas/io/s3.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
2727
fs = s3fs.S3FileSystem(anon=False)
2828
try:
2929
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
30-
except (OSError, NoCredentialsError):
30+
except (compat.FileNotFoundError, NoCredentialsError):
3131
# boto3 has troubles when trying to access a public file
3232
# when credentialed...
3333
# An OSError is raised if you have credentials, but they
@@ -36,4 +36,4 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
3636
# for that bucket.
3737
fs = s3fs.S3FileSystem(anon=True)
3838
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
39-
return filepath_or_buffer, None, compression
39+
return filepath_or_buffer, None, compression, True

pandas/io/sas/sas7bdat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
9090
self._current_row_on_page_index = 0
9191
self._current_row_in_file_index = 0
9292

93-
self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
93+
self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
9494
if isinstance(self._path_or_buf, compat.string_types):
9595
self._path_or_buf = open(self._path_or_buf, 'rb')
9696
self.handle = self._path_or_buf

pandas/io/sas/sas_xport.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,8 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
236236
self._chunksize = chunksize
237237

238238
if isinstance(filepath_or_buffer, str):
239-
filepath_or_buffer, encoding, compression = get_filepath_or_buffer(
239+
(filepath_or_buffer, encoding,
240+
compression, should_close) = get_filepath_or_buffer(
240241
filepath_or_buffer, encoding=encoding)
241242

242243
if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):

0 commit comments

Comments
 (0)