
Commit 5621418

Merge branch 'master' of https://github.com/pandas-dev/pandas into ref-cat-astype-2
2 parents c78aa77 + d4b6233 commit 5621418

7 files changed: +144 additions, -45 deletions


asv_bench/benchmarks/io/csv.py

+63-32
@@ -1,4 +1,4 @@
-from io import StringIO
+from io import BytesIO, StringIO
 import random
 import string
 
@@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
 class ReadCSVSkipRows(BaseIO):
 
     fname = "__test__.csv"
-    params = [None, 10000]
-    param_names = ["skiprows"]
+    params = ([None, 10000], ["c", "python"])
+    param_names = ["skiprows", "engine"]
 
-    def setup(self, skiprows):
+    def setup(self, skiprows, engine):
         N = 20000
         index = tm.makeStringIndex(N)
         df = DataFrame(
@@ -164,8 +164,8 @@ def setup(self, skiprows):
         )
         df.to_csv(self.fname)
 
-    def time_skipprows(self, skiprows):
-        read_csv(self.fname, skiprows=skiprows)
+    def time_skipprows(self, skiprows, engine):
+        read_csv(self.fname, skiprows=skiprows, engine=engine)
 
 
 class ReadUint64Integers(StringIORewind):
@@ -192,10 +192,10 @@ def time_read_uint64_na_values(self):
 class ReadCSVThousands(BaseIO):
 
     fname = "__test__.csv"
-    params = ([",", "|"], [None, ","])
-    param_names = ["sep", "thousands"]
+    params = ([",", "|"], [None, ","], ["c", "python"])
+    param_names = ["sep", "thousands", "engine"]
 
-    def setup(self, sep, thousands):
+    def setup(self, sep, thousands, engine):
         N = 10000
         K = 8
         data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
@@ -206,16 +206,19 @@ def setup(self, sep, thousands):
         df = df.applymap(lambda x: fmt.format(x))
         df.to_csv(self.fname, sep=sep)
 
-    def time_thousands(self, sep, thousands):
-        read_csv(self.fname, sep=sep, thousands=thousands)
+    def time_thousands(self, sep, thousands, engine):
+        read_csv(self.fname, sep=sep, thousands=thousands, engine=engine)
 
 
 class ReadCSVComment(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
         data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
         self.StringIO_input = StringIO("\n".join(data))
 
-    def time_comment(self):
+    def time_comment(self, engine):
         read_csv(
             self.data(self.StringIO_input), comment="#", header=None, names=list("abc")
         )
@@ -255,25 +258,47 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
         )
 
 
+class ReadCSVEngine(StringIORewind):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
+        data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000)
+        self.StringIO_input = StringIO("\n".join(data))
+        # simulate reading from file
+        self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))
+
+    def time_read_stringcsv(self, engine):
+        read_csv(self.data(self.StringIO_input), engine=engine)
+
+    def time_read_bytescsv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
+
 class ReadCSVCategorical(BaseIO):
 
     fname = "__test__.csv"
+    params = ["c", "python"]
+    param_names = ["engine"]
 
-    def setup(self):
+    def setup(self, engine):
         N = 100000
         group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
         df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc"))
         df.to_csv(self.fname, index=False)
 
-    def time_convert_post(self):
-        read_csv(self.fname).apply(Categorical)
+    def time_convert_post(self, engine):
+        read_csv(self.fname, engine=engine).apply(Categorical)
 
-    def time_convert_direct(self):
-        read_csv(self.fname, dtype="category")
+    def time_convert_direct(self, engine):
+        read_csv(self.fname, engine=engine, dtype="category")
 
 
 class ReadCSVParseDates(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
         data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
         {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
         {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
@@ -284,18 +309,20 @@ def setup(self):
         data = data.format(*two_cols)
         self.StringIO_input = StringIO(data)
 
-    def time_multiple_date(self):
+    def time_multiple_date(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=list(string.digits[:9]),
             parse_dates=[[1, 2], [1, 3]],
         )
 
-    def time_baseline(self):
+    def time_baseline(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             parse_dates=[1],
@@ -304,17 +331,18 @@ def time_baseline(self):
 
 
 class ReadCSVCachedParseDates(StringIORewind):
-    params = ([True, False],)
-    param_names = ["do_cache"]
+    params = ([True, False], ["c", "python"])
+    param_names = ["do_cache", "engine"]
 
-    def setup(self, do_cache):
+    def setup(self, do_cache, engine):
         data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
         self.StringIO_input = StringIO(data)
 
-    def time_read_csv_cached(self, do_cache):
+    def time_read_csv_cached(self, do_cache, engine):
         try:
             read_csv(
                 self.data(self.StringIO_input),
+                engine=engine,
                 header=None,
                 parse_dates=[0],
                 cache_dates=do_cache,
@@ -329,37 +357,40 @@ class ReadCSVMemoryGrowth(BaseIO):
     chunksize = 20
     num_rows = 1000
     fname = "__test__.csv"
+    params = ["c", "python"]
+    param_names = ["engine"]
 
-    def setup(self):
+    def setup(self, engine):
         with open(self.fname, "w") as f:
             for i in range(self.num_rows):
                 f.write(f"{i}\n")
 
-    def mem_parser_chunks(self):
+    def mem_parser_chunks(self, engine):
         # see gh-24805.
-        result = read_csv(self.fname, chunksize=self.chunksize)
+        result = read_csv(self.fname, chunksize=self.chunksize, engine=engine)
 
         for _ in result:
             pass
 
 
 class ReadCSVParseSpecialDate(StringIORewind):
-    params = (["mY", "mdY", "hm"],)
-    param_names = ["value"]
+    params = (["mY", "mdY", "hm"], ["c", "python"])
+    param_names = ["value", "engine"]
     objects = {
         "mY": "01-2019\n10-2019\n02/2000\n",
         "mdY": "12/02/2010\n",
         "hm": "21:34\n",
     }
 
-    def setup(self, value):
+    def setup(self, value, engine):
        count_elem = 10000
        data = self.objects[value] * count_elem
        self.StringIO_input = StringIO(data)
 
-    def time_read_special_date(self, value):
+    def time_read_special_date(self, value, engine):
        read_csv(
            self.data(self.StringIO_input),
+           engine=engine,
            sep=",",
            header=None,
            names=["Date"],
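For context on the new parameter axis: asv runs setup and each time_*/mem_* method once per combination of the values in params, passed in the order given by param_names, so adding ["c", "python"] times every case under both parser engines. A minimal, standalone sketch of the operation these benchmarks now exercise (the sample data and names below are illustrative, not taken from the suite):

from io import StringIO

import pandas as pd

data = "A,B,C\n" + "1,2,3\n" * 1000

for engine in ["c", "python"]:
    # roughly what each new (..., engine) parameter combination measures
    df = pd.read_csv(StringIO(data), engine=engine)
    assert df.shape == (1000, 3)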

ci/code_checks.sh

+4
@@ -178,6 +178,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
     pytest -q --doctest-modules pandas/core/strings/
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
+    MSG='Doctests sql.py' ; echo $MSG
+    pytest -q --doctest-modules pandas/io/sql.py
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     # Directories
 
     MSG='Doctests arrays'; echo $MSG

doc/source/whatsnew/v1.3.0.rst

+1-1
@@ -232,7 +232,7 @@ MultiIndex
 ^^^^^^^^^^
 
 - Bug in :meth:`DataFrame.drop` raising ``TypeError`` when :class:`MultiIndex` is non-unique and no level is provided (:issue:`36293`)
--
+- Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when the :class:`MultiIndex` contains ``NaN`` values, even when they are differently ordered (:issue:`38439`)
 
 I/O
 ^^^

pandas/core/indexes/category.py

+1-9
@@ -8,7 +8,7 @@
 from pandas._libs import index as libindex
 from pandas._libs.lib import no_default
 from pandas._typing import ArrayLike, Label
-from pandas.util._decorators import Appender, cache_readonly, doc
+from pandas.util._decorators import Appender, doc
 
 from pandas.core.dtypes.common import (
     ensure_platform_int,
@@ -381,14 +381,6 @@ def fillna(self, value, downcast=None):
         cat = self._data.fillna(value)
         return type(self)._simple_new(cat, name=self.name)
 
-    @cache_readonly
-    def _engine(self):
-        # we are going to look things up with the codes themselves.
-        # To avoid a reference cycle, bind `codes` to a local variable, so
-        # `self` is not passed into the lambda.
-        codes = self.codes
-        return self._engine_type(lambda: codes, len(self))
-
     @doc(Index.unique)
     def unique(self, level=None):
         if level is not None:

pandas/core/indexes/multi.py

+7-3
@@ -3454,13 +3454,17 @@ def equals(self, other: object) -> bool:
 
         for i in range(self.nlevels):
             self_codes = self.codes[i]
-            self_codes = self_codes[self_codes != -1]
+            other_codes = other.codes[i]
+            self_mask = self_codes == -1
+            other_mask = other_codes == -1
+            if not np.array_equal(self_mask, other_mask):
+                return False
+            self_codes = self_codes[~self_mask]
             self_values = algos.take_nd(
                 np.asarray(self.levels[i]._values), self_codes, allow_fill=False
             )
 
-            other_codes = other.codes[i]
-            other_codes = other_codes[other_codes != -1]
+            other_codes = other_codes[~other_mask]
             other_values = algos.take_nd(
                 np.asarray(other.levels[i]._values), other_codes, allow_fill=False
             )
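The fix compares where the missing codes (sentinel -1, i.e. NaN) sit in each level before dropping them; previously each side filtered out its own -1 codes independently, so NaN entries in different rows vanished before the value comparison and the levels could still match. A standalone sketch of the idea in plain NumPy (illustrative only, not the pandas internals):

import numpy as np

self_codes = np.array([0, -1])   # level codes for (81.0, NaN)
other_codes = np.array([-1, 0])  # same values, NaN in the other row

self_mask = self_codes == -1
other_mask = other_codes == -1

# new check: differently placed NaNs now make the levels unequal
assert not np.array_equal(self_mask, other_mask)

# old behaviour: dropping the -1 codes first left identical arrays,
# so the comparison incorrectly saw the levels as equal
assert np.array_equal(self_codes[self_codes != -1],
                      other_codes[other_codes != -1])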

pandas/io/sql.py

+58
@@ -482,6 +482,64 @@ def read_sql(
     --------
     read_sql_table : Read SQL database table into a DataFrame.
     read_sql_query : Read SQL query into a DataFrame.
+
+    Examples
+    --------
+    Read data from SQL via either a SQL query or a SQL tablename.
+    When using a SQLite database only SQL queries are accepted,
+    providing only the SQL tablename will result in an error.
+
+    >>> from sqlite3 import connect
+    >>> conn = connect(':memory:')
+    >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']],
+    ...                   columns=['int_column', 'date_column'])
+    >>> df.to_sql('test_data', conn)
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data', conn)
+       int_column date_column
+    0           0    10/11/12
+    1           1    12/11/10
+
+    >>> pd.read_sql('test_data', 'postgres:///db_name')  # doctest:+SKIP
+
+    Apply date parsing to columns through the ``parse_dates`` argument
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates=["date_column"])
+       int_column date_column
+    0           0  2012-10-11
+    1           1  2010-12-11
+
+    The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns.
+    Custom argument values for applying ``pd.to_datetime`` on a column are specified
+    via a dictionary format:
+    1. Ignore errors while parsing the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"errors": "ignore"}})
+       int_column date_column
+    0           0  2012-10-11
+    1           1  2010-12-11
+
+    2. Apply a dayfirst date parsing order on the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"dayfirst": True}})
+       int_column date_column
+    0           0  2012-11-10
+    1           1  2010-11-12
+
+    3. Apply custom formatting when date parsing the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"format": "%d/%m/%y"}})
+       int_column date_column
+    0           0  2012-11-10
+    1           1  2010-11-12
     """
     pandas_sql = pandasSQL_builder(con)
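The dictionary form of parse_dates shown above forwards each inner dict as keyword arguments to pd.to_datetime for that column. A rough sketch of that equivalence, assuming the same in-memory SQLite table the docstring builds (the comparison itself is illustrative, not part of the change):

from sqlite3 import connect

import pandas as pd

conn = connect(":memory:")
pd.DataFrame(
    {"int_column": [0, 1], "date_column": ["10/11/12", "12/11/10"]}
).to_sql("test_data", conn, index=False)

# letting read_sql parse the dates ...
parsed = pd.read_sql(
    "SELECT int_column, date_column FROM test_data",
    conn,
    parse_dates={"date_column": {"format": "%d/%m/%y"}},
)

# ... should match reading the raw strings and calling to_datetime yourself
raw = pd.read_sql("SELECT int_column, date_column FROM test_data", conn)
raw["date_column"] = pd.to_datetime(raw["date_column"], format="%d/%m/%y")

assert parsed.equals(raw)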
487545

pandas/tests/indexes/multi/test_equivalence.py

+10
@@ -209,6 +209,16 @@ def test_equals_missing_values():
     assert not result
 
 
+def test_equals_missing_values_differently_sorted():
+    # GH#38439
+    mi1 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
+    mi2 = pd.MultiIndex.from_tuples([(np.nan, np.nan), (81.0, np.nan)])
+    assert not mi1.equals(mi2)
+
+    mi2 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
+    assert mi1.equals(mi2)
+
+
 def test_is_():
     mi = MultiIndex.from_tuples(zip(range(10), range(10)))
     assert mi.is_(mi)
