Skip to content

Commit 61f2579

Browse files
committed
Merge remote-tracking branch 'upstream/master' into unify_update
2 parents e4b734b + 2d4dd50 commit 61f2579

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+558
-301
lines changed

doc/source/io.rst

+5
Original file line numberDiff line numberDiff line change
@@ -2854,6 +2854,11 @@ It is often the case that users will insert columns to do temporary computations
28542854
in Excel and you may not want to read in those columns. ``read_excel`` takes
28552855
a ``usecols`` keyword to allow you to specify a subset of columns to parse.
28562856

2857+
.. deprecated:: 0.24.0
2858+
2859+
Passing in an integer for ``usecols`` has been deprecated. Please pass in a list
2860+
of ints from 0 to ``usecols`` inclusive instead.
2861+
28572862
If ``usecols`` is an integer, then it is assumed to indicate the last column
28582863
to be parsed.
28592864

doc/source/whatsnew/v0.18.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ These changes conform sparse handling to return the correct types and work to ma
266266

267267
``SparseArray.take`` now returns a scalar for scalar input, ``SparseArray`` for others. Furthermore, it handles a negative indexer with the same rule as ``Index`` (:issue:`10560`, :issue:`12796`)
268268

269-
.. ipython:: python
269+
.. code-block:: python
270270

271271
s = pd.SparseArray([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])
272272
s.take(0)

doc/source/whatsnew/v0.24.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,7 @@ Deprecations
974974
- The methods :meth:`DataFrame.update` and :meth:`Panel.update` have deprecated the ``raise_conflict=False|True`` keyword in favor of ``errors='ignore'|'raise'`` (:issue:`23585`)
975975
- Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of
976976
`use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`)
977+
- :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`)
977978
- Constructing a :class:`TimedeltaIndex` from data with ``datetime64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23539`)
978979

979980
.. _whatsnew_0240.deprecations.datetimelike_int_ops:
@@ -1302,6 +1303,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
13021303
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
13031304
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
13041305
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
1306+
- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
13051307
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
13061308
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
13071309
- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)

pandas/_libs/parsers.pyx

+1
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ cdef extern from "parser/tokenizer.h":
132132
int64_t *word_starts # where we are in the stream
133133
int64_t words_len
134134
int64_t words_cap
135+
int64_t max_words_cap # maximum word cap encountered
135136

136137
char *pword_start # pointer to stream start of current field
137138
int64_t word_start # position start of current field

pandas/_libs/src/parser/tokenizer.c

+31-2
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ int parser_init(parser_t *self) {
197197
sz = sz ? sz : 1;
198198
self->words = (char **)malloc(sz * sizeof(char *));
199199
self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t));
200+
self->max_words_cap = sz;
200201
self->words_cap = sz;
201202
self->words_len = 0;
202203

@@ -247,7 +248,7 @@ void parser_del(parser_t *self) {
247248
}
248249

249250
static int make_stream_space(parser_t *self, size_t nbytes) {
250-
int64_t i, cap;
251+
int64_t i, cap, length;
251252
int status;
252253
void *orig_ptr, *newptr;
253254

@@ -287,8 +288,23 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
287288
*/
288289

289290
cap = self->words_cap;
291+
292+
/**
293+
* If we are reading in chunks, we need to be aware of the maximum number
294+
* of words we have seen in previous chunks (self->max_words_cap), so
295+
* that way, we can properly allocate when reading subsequent ones.
296+
*
297+
* Otherwise, we risk a buffer overflow if we mistakenly under-allocate
298+
* just because a recent chunk did not have as many words.
299+
*/
300+
if (self->words_len + nbytes < self->max_words_cap) {
301+
length = self->max_words_cap - nbytes;
302+
} else {
303+
length = self->words_len;
304+
}
305+
290306
self->words =
291-
(char **)grow_buffer((void *)self->words, self->words_len,
307+
(char **)grow_buffer((void *)self->words, length,
292308
(int64_t*)&self->words_cap, nbytes,
293309
sizeof(char *), &status);
294310
TRACE(
@@ -1241,6 +1257,19 @@ int parser_trim_buffers(parser_t *self) {
12411257

12421258
int64_t i;
12431259

1260+
/**
1261+
* Before we free up space and trim, we should
1262+
* save how many words we saw when parsing, if
1263+
* it exceeds the maximum number we saw before.
1264+
*
1265+
* This is important for when we read in chunks,
1266+
* so that we can inform subsequent chunk parsing
1267+
* as to how many words we could possibly see.
1268+
*/
1269+
if (self->words_cap > self->max_words_cap) {
1270+
self->max_words_cap = self->words_cap;
1271+
}
1272+
12441273
/* trim words, word_starts */
12451274
new_cap = _next_pow2(self->words_len) + 1;
12461275
if (new_cap < self->words_cap) {

pandas/_libs/src/parser/tokenizer.h

+1
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ typedef struct parser_t {
142142
int64_t *word_starts; // where we are in the stream
143143
int64_t words_len;
144144
int64_t words_cap;
145+
int64_t max_words_cap; // maximum word cap encountered
145146

146147
char *pword_start; // pointer to stream start of current field
147148
int64_t word_start; // position start of current field

pandas/core/arrays/datetimelike.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,12 @@ def asi8(self):
124124
# do not cache or you'll create a memory leak
125125
return self._data.view('i8')
126126

127-
# ------------------------------------------------------------------
128-
# Array-like Methods
127+
# ----------------------------------------------------------------
128+
# Array-Like / EA-Interface Methods
129+
130+
@property
131+
def nbytes(self):
132+
return self._data.nbytes
129133

130134
@property
131135
def shape(self):

pandas/core/arrays/datetimes.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ def _resolution(self):
385385
return libresolution.resolution(self.asi8, self.tz)
386386

387387
# ----------------------------------------------------------------
388-
# Array-like Methods
388+
# Array-Like / EA-Interface Methods
389389

390390
def __array__(self, dtype=None):
391391
if is_object_dtype(dtype):

pandas/core/arrays/period.py

+52-54
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,6 @@ def _concat_same_type(cls, to_concat):
272272

273273
# --------------------------------------------------------------------
274274
# Data / Attributes
275-
@property
276-
def nbytes(self):
277-
# TODO(DatetimeArray): remove
278-
return self._data.nbytes
279275

280276
@cache_readonly
281277
def dtype(self):
@@ -286,10 +282,6 @@ def _ndarray_values(self):
286282
# Ordinals
287283
return self._data
288284

289-
@property
290-
def asi8(self):
291-
return self._data
292-
293285
@property
294286
def freq(self):
295287
"""Return the frequency object for this PeriodArray."""
@@ -330,6 +322,50 @@ def start_time(self):
330322
def end_time(self):
331323
return self.to_timestamp(how='end')
332324

325+
def to_timestamp(self, freq=None, how='start'):
326+
"""
327+
Cast to DatetimeArray/Index.
328+
329+
Parameters
330+
----------
331+
freq : string or DateOffset, optional
332+
Target frequency. The default is 'D' for week or longer,
333+
'S' otherwise
334+
how : {'s', 'e', 'start', 'end'}
335+
336+
Returns
337+
-------
338+
DatetimeArray/Index
339+
"""
340+
from pandas.core.arrays import DatetimeArrayMixin
341+
342+
how = libperiod._validate_end_alias(how)
343+
344+
end = how == 'E'
345+
if end:
346+
if freq == 'B':
347+
# roll forward to ensure we land on B date
348+
adjust = Timedelta(1, 'D') - Timedelta(1, 'ns')
349+
return self.to_timestamp(how='start') + adjust
350+
else:
351+
adjust = Timedelta(1, 'ns')
352+
return (self + self.freq).to_timestamp(how='start') - adjust
353+
354+
if freq is None:
355+
base, mult = frequencies.get_freq_code(self.freq)
356+
freq = frequencies.get_to_timestamp_base(base)
357+
else:
358+
freq = Period._maybe_convert_freq(freq)
359+
360+
base, mult = frequencies.get_freq_code(freq)
361+
new_data = self.asfreq(freq, how=how)
362+
363+
new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base)
364+
return DatetimeArrayMixin(new_data, freq='infer')
365+
366+
# --------------------------------------------------------------------
367+
# Array-Like / EA-Interface Methods
368+
333369
def __repr__(self):
334370
return '<{}>\n{}\nLength: {}, dtype: {}'.format(
335371
self.__class__.__name__,
@@ -456,6 +492,8 @@ def value_counts(self, dropna=False):
456492
name=result.index.name)
457493
return Series(result.values, index=index, name=result.name)
458494

495+
# --------------------------------------------------------------------
496+
459497
def shift(self, periods=1):
460498
"""
461499
Shift values by desired number.
@@ -567,49 +605,9 @@ def asfreq(self, freq=None, how='E'):
567605

568606
return type(self)(new_data, freq=freq)
569607

570-
def to_timestamp(self, freq=None, how='start'):
571-
"""
572-
Cast to DatetimeArray/Index
573-
574-
Parameters
575-
----------
576-
freq : string or DateOffset, optional
577-
Target frequency. The default is 'D' for week or longer,
578-
'S' otherwise
579-
how : {'s', 'e', 'start', 'end'}
580-
581-
Returns
582-
-------
583-
DatetimeArray/Index
584-
"""
585-
from pandas.core.arrays import DatetimeArrayMixin
586-
587-
how = libperiod._validate_end_alias(how)
588-
589-
end = how == 'E'
590-
if end:
591-
if freq == 'B':
592-
# roll forward to ensure we land on B date
593-
adjust = Timedelta(1, 'D') - Timedelta(1, 'ns')
594-
return self.to_timestamp(how='start') + adjust
595-
else:
596-
adjust = Timedelta(1, 'ns')
597-
return (self + self.freq).to_timestamp(how='start') - adjust
598-
599-
if freq is None:
600-
base, mult = frequencies.get_freq_code(self.freq)
601-
freq = frequencies.get_to_timestamp_base(base)
602-
else:
603-
freq = Period._maybe_convert_freq(freq)
604-
605-
base, mult = frequencies.get_freq_code(freq)
606-
new_data = self.asfreq(freq, how=how)
607-
608-
new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base)
609-
return DatetimeArrayMixin(new_data, freq='infer')
610-
611608
# ------------------------------------------------------------------
612609
# Formatting
610+
613611
def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs):
614612
""" actually format my specific types """
615613
# TODO(DatetimeArray): remove
@@ -630,9 +628,13 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs):
630628
values = np.array([formatter(dt) for dt in values])
631629
return values
632630

631+
# Delegation...
632+
def strftime(self, date_format):
633+
return self._format_native_types(date_format=date_format)
634+
633635
def repeat(self, repeats, *args, **kwargs):
634636
"""
635-
Repeat elements of a Categorical.
637+
Repeat elements of a PeriodArray.
636638
637639
See also
638640
--------
@@ -643,10 +645,6 @@ def repeat(self, repeats, *args, **kwargs):
643645
values = self._data.repeat(repeats)
644646
return type(self)(values, self.freq)
645647

646-
# Delegation...
647-
def strftime(self, date_format):
648-
return self._format_native_types(date_format=date_format)
649-
650648
def astype(self, dtype, copy=True):
651649
# TODO: Figure out something better here...
652650
# We have DatetimeLikeArrayMixin ->

0 commit comments

Comments
 (0)