Skip to content

Commit ffd90a7

Browse files
authored
Merge branch 'master' into move-metadata-to-cfg
2 parents 24c4e29 + 125441c commit ffd90a7

File tree

161 files changed

+7382
-6894
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

161 files changed

+7382
-6894
lines changed

.github/PULL_REQUEST_TEMPLATE.md

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
- [ ] closes #xxxx
22
- [ ] tests added / passed
3-
- [ ] passes `black pandas`
4-
- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
3+
- [ ] Ensure all linting tests pass, see [here](https://pandas.pydata.org/pandas-docs/dev/development/contributing.html#code-standards) for how to run them
54
- [ ] whatsnew entry

.pre-commit-config.yaml

+10-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
minimum_pre_commit_version: 2.9.2
2+
exclude: ^LICENSES/|\.(html|csv|svg)$
23
repos:
34
- repo: https://github.com/python/black
45
rev: 20.8b1
@@ -121,6 +122,13 @@ repos:
121122
entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module"
122123
types: [python]
123124
exclude: ^(asv_bench|pandas/tests|doc)/
125+
- id: unwanted-patterns-bare-pytest-raises
126+
name: Check for use of bare pytest raises
127+
language: python
128+
entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises"
129+
types: [python]
130+
files: ^pandas/tests/
131+
exclude: ^pandas/tests/(computation|extension|io)/
124132
- id: inconsistent-namespace-usage
125133
name: 'Check for inconsistent use of pandas namespace in tests'
126134
entry: python scripts/check_for_inconsistent_pandas_namespace.py
@@ -137,7 +145,7 @@ repos:
137145
name: Check for use of foo.__class__ instead of type(foo)
138146
entry: \.__class__
139147
language: pygrep
140-
files: \.(py|pyx)$
148+
types_or: [python, cython]
141149
- id: unwanted-typing
142150
name: Check for use of comment-based annotation syntax and missing error codes
143151
entry: |
@@ -165,9 +173,8 @@ repos:
165173
rev: v3.4.0
166174
hooks:
167175
- id: end-of-file-fixer
168-
exclude: ^LICENSES/|\.(html|csv|txt|svg|py)$
176+
exclude: \.txt$
169177
- id: trailing-whitespace
170-
exclude: \.(html|svg)$
171178
- repo: https://github.com/codespell-project/codespell
172179
rev: v2.0.0
173180
hooks:

asv_bench/benchmarks/arithmetic.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -122,18 +122,18 @@ def setup(self, op):
122122
n_rows = 500
123123

124124
# construct dataframe with 2 blocks
125-
arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8")
126-
arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4")
125+
arr1 = np.random.randn(n_rows, n_cols // 2).astype("f8")
126+
arr2 = np.random.randn(n_rows, n_cols // 2).astype("f4")
127127
df = pd.concat(
128128
[pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True
129129
)
130130
# should already be the case, but just to be sure
131131
df._consolidate_inplace()
132132

133133
# TODO: GH#33198 the setting here shouldn't need two steps
134-
arr1 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
135-
arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("i8")
136-
arr3 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
134+
arr1 = np.random.randn(n_rows, n_cols // 4).astype("f8")
135+
arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8")
136+
arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8")
137137
df2 = pd.concat(
138138
[pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)],
139139
axis=1,

asv_bench/benchmarks/frame_methods.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ class Repr:
263263
def setup(self):
264264
nrows = 10000
265265
data = np.random.randn(nrows, 10)
266-
arrays = np.tile(np.random.randn(3, int(nrows / 100)), 100)
266+
arrays = np.tile(np.random.randn(3, nrows // 100), 100)
267267
idx = MultiIndex.from_arrays(arrays)
268268
self.df3 = DataFrame(data, index=idx)
269269
self.df4 = DataFrame(data, index=np.random.randn(nrows))
@@ -648,9 +648,9 @@ class Describe:
648648
def setup(self):
649649
self.df = DataFrame(
650650
{
651-
"a": np.random.randint(0, 100, int(1e6)),
652-
"b": np.random.randint(0, 100, int(1e6)),
653-
"c": np.random.randint(0, 100, int(1e6)),
651+
"a": np.random.randint(0, 100, 10 ** 6),
652+
"b": np.random.randint(0, 100, 10 ** 6),
653+
"c": np.random.randint(0, 100, 10 ** 6),
654654
}
655655
)
656656

asv_bench/benchmarks/groupby.py

+3
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ def setup(self, data, key):
126126
def time_series_groups(self, data, key):
127127
self.ser.groupby(self.ser).groups
128128

129+
def time_series_indices(self, data, key):
130+
self.ser.groupby(self.ser).indices
131+
129132

130133
class GroupManyLabels:
131134

asv_bench/benchmarks/hash_functions.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ class Float64GroupIndex:
103103
# GH28303
104104
def setup(self):
105105
self.df = pd.date_range(
106-
start="1/1/2018", end="1/2/2018", periods=1e6
106+
start="1/1/2018", end="1/2/2018", periods=10 ** 6
107107
).to_frame()
108-
self.group_index = np.round(self.df.index.astype(int) / 1e9)
108+
self.group_index = np.round(self.df.index.astype(int) / 10 ** 9)
109109

110110
def time_groupby(self):
111111
self.df.groupby(self.group_index).last()

asv_bench/benchmarks/inference.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class ToNumericDowncast:
4242
]
4343

4444
N = 500000
45-
N2 = int(N / 2)
45+
N2 = N // 2
4646

4747
data_dict = {
4848
"string-int": ["1"] * N2 + [2] * N2,

asv_bench/benchmarks/join_merge.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def setup(self):
158158
daily_dates = date_index.to_period("D").to_timestamp("S", "S")
159159
self.fracofday = date_index.values - daily_dates.values
160160
self.fracofday = self.fracofday.astype("timedelta64[ns]")
161-
self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0
161+
self.fracofday = self.fracofday.astype(np.float64) / 86_400_000_000_000
162162
self.fracofday = Series(self.fracofday, daily_dates)
163163
index = date_range(date_index.min(), date_index.max(), freq="D")
164164
self.temp = Series(1.0, index)[self.fracofday.index]

asv_bench/benchmarks/rolling.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ class PeakMemFixedWindowMinMax:
171171
params = ["min", "max"]
172172

173173
def setup(self, operation):
174-
N = int(1e6)
174+
N = 10 ** 6
175175
arr = np.random.random(N)
176176
self.roll = pd.Series(arr).rolling(2)
177177

@@ -233,7 +233,7 @@ class GroupbyLargeGroups:
233233

234234
def setup(self):
235235
N = 100000
236-
self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)})
236+
self.df = pd.DataFrame({"A": [1, 2] * (N // 2), "B": np.random.randn(N)})
237237

238238
def time_rolling_multiindex_creation(self):
239239
self.df.groupby("A").rolling(3).mean()

asv_bench/benchmarks/series_methods.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def time_dir_strings(self):
284284
class SeriesGetattr:
285285
# https://github.com/pandas-dev/pandas/issues/19764
286286
def setup(self):
287-
self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=int(1e6)))
287+
self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=10 ** 6))
288288

289289
def time_series_datetimeindex_repr(self):
290290
getattr(self.s, "a", None)

asv_bench/benchmarks/timeseries.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ def time_iso8601_tz_spaceformat(self):
346346
class ToDatetimeNONISO8601:
347347
def setup(self):
348348
N = 10000
349-
half = int(N / 2)
349+
half = N // 2
350350
ts_string_1 = "March 1, 2018 12:00:00+0400"
351351
ts_string_2 = "March 1, 2018 12:00:00+0500"
352352
self.same_offset = [ts_string_1] * N
@@ -376,7 +376,7 @@ def setup(self):
376376
self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N
377377
self.diff_offset = [
378378
f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10)
379-
] * int(N / 10)
379+
] * (N // 10)
380380

381381
def time_exact(self):
382382
to_datetime(self.s2, format="%d%b%y")

doc/source/getting_started/comparison/comparison_with_sas.rst

+7-67
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ For potential users coming from `SAS <https://en.wikipedia.org/wiki/SAS_(softwar
88
this page is meant to demonstrate how different SAS operations would be
99
performed in pandas.
1010

11-
.. include:: comparison_boilerplate.rst
11+
.. include:: includes/introduction.rst
1212

1313
.. note::
1414

@@ -93,16 +93,7 @@ specifying the column names.
9393
;
9494
run;
9595
96-
A pandas ``DataFrame`` can be constructed in many different ways,
97-
but for a small number of values, it is often convenient to specify it as
98-
a Python dictionary, where the keys are the column names
99-
and the values are the data.
100-
101-
.. ipython:: python
102-
103-
df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]})
104-
df
105-
96+
.. include:: includes/construct_dataframe.rst
10697

10798
Reading external data
10899
~~~~~~~~~~~~~~~~~~~~~
@@ -217,12 +208,7 @@ or more columns.
217208
DATA step begins and can also be used in PROC statements */
218209
run;
219210
220-
DataFrames can be filtered in multiple ways; the most intuitive of which is using
221-
:ref:`boolean indexing <indexing.boolean>`
222-
223-
.. ipython:: python
224-
225-
tips[tips["total_bill"] > 10].head()
211+
.. include:: includes/filtering.rst
226212

227213
If/then logic
228214
~~~~~~~~~~~~~
@@ -239,18 +225,7 @@ In SAS, if/then logic can be used to create new columns.
239225
else bucket = 'high';
240226
run;
241227
242-
The same operation in pandas can be accomplished using
243-
the ``where`` method from ``numpy``.
244-
245-
.. ipython:: python
246-
247-
tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high")
248-
tips.head()
249-
250-
.. ipython:: python
251-
:suppress:
252-
253-
tips = tips.drop("bucket", axis=1)
228+
.. include:: includes/if_then.rst
254229

255230
Date functionality
256231
~~~~~~~~~~~~~~~~~~
@@ -278,28 +253,7 @@ functions pandas supports other Time Series features
278253
not available in Base SAS (such as resampling and custom offsets) -
279254
see the :ref:`timeseries documentation<timeseries>` for more details.
280255

281-
.. ipython:: python
282-
283-
tips["date1"] = pd.Timestamp("2013-01-15")
284-
tips["date2"] = pd.Timestamp("2015-02-15")
285-
tips["date1_year"] = tips["date1"].dt.year
286-
tips["date2_month"] = tips["date2"].dt.month
287-
tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin()
288-
tips["months_between"] = tips["date2"].dt.to_period("M") - tips[
289-
"date1"
290-
].dt.to_period("M")
291-
292-
tips[
293-
["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"]
294-
].head()
295-
296-
.. ipython:: python
297-
:suppress:
298-
299-
tips = tips.drop(
300-
["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"],
301-
axis=1,
302-
)
256+
.. include:: includes/time_date.rst
303257

304258
Selection of columns
305259
~~~~~~~~~~~~~~~~~~~~
@@ -349,14 +303,7 @@ Sorting in SAS is accomplished via ``PROC SORT``
349303
by sex total_bill;
350304
run;
351305
352-
pandas objects have a :meth:`~DataFrame.sort_values` method, which
353-
takes a list of columns to sort by.
354-
355-
.. ipython:: python
356-
357-
tips = tips.sort_values(["sex", "total_bill"])
358-
tips.head()
359-
306+
.. include:: includes/sorting.rst
360307

361308
String processing
362309
-----------------
@@ -377,14 +324,7 @@ functions. ``LENGTHN`` excludes trailing blanks and ``LENGTHC`` includes trailin
377324
put(LENGTHC(time));
378325
run;
379326
380-
Python determines the length of a character string with the ``len`` function.
381-
``len`` includes trailing blanks. Use ``len`` and ``rstrip`` to exclude
382-
trailing blanks.
383-
384-
.. ipython:: python
385-
386-
tips["time"].str.len().head()
387-
tips["time"].str.rstrip().str.len().head()
327+
.. include:: includes/length.rst
388328

389329

390330
Find

doc/source/getting_started/comparison/comparison_with_spreadsheets.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ terminology and link to documentation for Excel, but much will be the same/simil
1414
`Apple Numbers <https://www.apple.com/mac/numbers/compatibility/functions.html>`_, and other
1515
Excel-compatible spreadsheet software.
1616

17-
.. include:: comparison_boilerplate.rst
17+
.. include:: includes/introduction.rst
1818

1919
Data structures
2020
---------------

doc/source/getting_started/comparison/comparison_with_sql.rst

+3-18
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ Since many potential pandas users have some familiarity with
88
`SQL <https://en.wikipedia.org/wiki/SQL>`_, this page is meant to provide some examples of how
99
various SQL operations would be performed using pandas.
1010

11-
.. include:: comparison_boilerplate.rst
11+
.. include:: includes/introduction.rst
1212

1313
Most of the examples will utilize the ``tips`` dataset found within pandas tests. We'll read
1414
the data into a DataFrame called ``tips`` and assume we have a database table of the same name and
@@ -65,24 +65,9 @@ Filtering in SQL is done via a WHERE clause.
6565
6666
SELECT *
6767
FROM tips
68-
WHERE time = 'Dinner'
69-
LIMIT 5;
70-
71-
DataFrames can be filtered in multiple ways; the most intuitive of which is using
72-
:ref:`boolean indexing <indexing.boolean>`
73-
74-
.. ipython:: python
75-
76-
tips[tips["time"] == "Dinner"].head(5)
77-
78-
The above statement is simply passing a ``Series`` of True/False objects to the DataFrame,
79-
returning all rows with True.
80-
81-
.. ipython:: python
68+
WHERE time = 'Dinner';
8269
83-
is_dinner = tips["time"] == "Dinner"
84-
is_dinner.value_counts()
85-
tips[is_dinner].head(5)
70+
.. include:: includes/filtering.rst
8671

8772
Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame using | (OR) and &
8873
(AND).

0 commit comments

Comments
 (0)