Skip to content

Commit 63168d9

Browse files
author
Marco Gorelli
committed
resolve conflict
2 parents 3ac9a9a + 39602e7 commit 63168d9

File tree

174 files changed

+7439
-4943
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

174 files changed

+7439
-4943
lines changed

asv_bench/benchmarks/categoricals.py

+14
Original file line numberDiff line numberDiff line change
@@ -282,4 +282,18 @@ def time_sort_values(self):
282282
self.index.sort_values(ascending=False)
283283

284284

285+
class SearchSorted:
286+
def setup(self):
287+
N = 10 ** 5
288+
self.ci = tm.makeCategoricalIndex(N).sort_values()
289+
self.c = self.ci.values
290+
self.key = self.ci.categories[1]
291+
292+
def time_categorical_index_contains(self):
293+
self.ci.searchsorted(self.key)
294+
295+
def time_categorical_contains(self):
296+
self.c.searchsorted(self.key)
297+
298+
285299
from .pandas_vb_common import setup # noqa: F401 isort:skip

ci/code_checks.sh

+11-3
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
125125
# invgrep -R --include="*.py*" -E "from numpy import nan " pandas # GH#24822 not yet implemented since the offending imports have not all been removed
126126
RET=$(($RET + $?)) ; echo $MSG "DONE"
127127

128+
MSG='Check for use of exec' ; echo $MSG
129+
invgrep -R --include="*.py*" -E "[^a-zA-Z0-9_]exec\(" pandas
130+
RET=$(($RET + $?)) ; echo $MSG "DONE"
131+
128132
MSG='Check for pytest warns' ; echo $MSG
129133
invgrep -r -E --include '*.py' 'pytest\.warns' pandas/tests/
130134
RET=$(($RET + $?)) ; echo $MSG "DONE"
@@ -184,7 +188,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
184188
invgrep -R --include="*.rst" ".. ipython ::" doc/source
185189
RET=$(($RET + $?)) ; echo $MSG "DONE"
186190

187-
MSG='Check that no file in the repo contains tailing whitespaces' ; echo $MSG
191+
MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG
188192
set -o pipefail
189193
if [[ "$AZURE" == "true" ]]; then
190194
# we exclude all c/cpp files as the c/cpp files of pandas code base are tested when Linting .c and .h files
@@ -262,13 +266,17 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
262266
-k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range"
263267
RET=$(($RET + $?)) ; echo $MSG "DONE"
264268

269+
MSG='Doctests arrays/string_.py' ; echo $MSG
270+
pytest -q --doctest-modules pandas/core/arrays/string_.py
271+
RET=$(($RET + $?)) ; echo $MSG "DONE"
272+
265273
fi
266274

267275
### DOCSTRINGS ###
268276
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
269277

270-
MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG
271-
$BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05
278+
MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA01, SA02, SA03, SA05)' ; echo $MSG
279+
$BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA01,SA02,SA03,SA05
272280
RET=$(($RET + $?)) ; echo $MSG "DONE"
273281

274282
fi

ci/deps/azure-36-32bit.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ channels:
33
- defaults
44
- conda-forge
55
dependencies:
6+
- attrs=19.1.0
67
- gcc_linux-32
78
- gcc_linux-32
89
- gxx_linux-32
@@ -11,7 +12,7 @@ dependencies:
1112
- python=3.6.*
1213
- pytz=2017.2
1314
# universal
14-
- pytest>=4.0.2,<5.0.0
15+
- pytest
1516
- pytest-xdist
1617
- pytest-mock
1718
- pytest-azurepipelines

ci/deps/travis-36-cov.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ dependencies:
2929
- python-snappy
3030
- python=3.6.*
3131
- pytz
32-
- s3fs
32+
- s3fs<0.3
3333
- scikit-learn
3434
- scipy
3535
- sqlalchemy

ci/deps/travis-36-slow.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ dependencies:
1818
- python-dateutil
1919
- python=3.6.*
2020
- pytz
21-
- s3fs
21+
- s3fs<0.3
2222
- scipy
2323
- sqlalchemy
2424
- xlrd

ci/deps/travis-37.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ dependencies:
1717
- pytest-xdist>=1.29.0
1818
- pytest-mock
1919
- hypothesis>=3.58.0
20-
- s3fs
20+
- s3fs<0.3
2121
- pip
2222
- pyreadstat
2323
- pip:

ci/run_tests.sh

+5-6
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,9 @@ do
4343
# if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code
4444
sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret"
4545

46-
# 2019-08-21 disabling because this is hitting HTTP 400 errors GH#27602
47-
# if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then
48-
# echo "uploading coverage for $TYPE tests"
49-
# echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
50-
# bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
51-
# fi
46+
if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then
47+
echo "uploading coverage for $TYPE tests"
48+
echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
49+
bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
50+
fi
5251
done

doc/.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
data/
2+
timeseries.csv
3+
timeseries.parquet
4+
timeseries_wide.parquet

doc/redirects.csv

+4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ whatsnew,whatsnew/index
66
release,whatsnew/index
77

88
# getting started
9+
install,getting_started/install
910
10min,getting_started/10min
1011
basics,getting_started/basics
1112
comparison_with_r,getting_started/comparison/comparison_with_r
@@ -1577,3 +1578,6 @@ generated/pandas.unique,../reference/api/pandas.unique
15771578
generated/pandas.util.hash_array,../reference/api/pandas.util.hash_array
15781579
generated/pandas.util.hash_pandas_object,../reference/api/pandas.util.hash_pandas_object
15791580
generated/pandas.wide_to_long,../reference/api/pandas.wide_to_long
1581+
1582+
# Cached searches
1583+
reference/api/pandas.DataFrame.from_csv,pandas.read_csv

doc/source/conf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@
191191

192192
# The theme to use for HTML and HTML Help pages. Major themes that come with
193193
# Sphinx are currently 'default' and 'sphinxdoc'.
194-
html_theme = "nature_with_gtoc"
194+
html_theme = "pandas_sphinx_theme"
195195

196196
# The style sheet to use for HTML and HTML Help pages. A file of that name
197197
# must exist either in Sphinx' static/ path, or in one of the custom paths
@@ -204,7 +204,7 @@
204204
# html_theme_options = {}
205205

206206
# Add any paths that contain custom themes here, relative to this directory.
207-
html_theme_path = ["themes"]
207+
# html_theme_path = ["themes"]
208208

209209
# The name for this set of Sphinx documents. If None, it defaults to
210210
# "<project> v<release> documentation".

doc/source/development/contributing.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -952,7 +952,7 @@ the expected correct result::
952952
Transitioning to ``pytest``
953953
~~~~~~~~~~~~~~~~~~~~~~~~~~~
954954

955-
*pandas* existing test structure is *mostly* classed based, meaning that you will typically find tests wrapped in a class.
955+
The existing *pandas* test structure is *mostly* class-based, meaning that you will typically find tests wrapped in a class.
956956

957957
.. code-block:: python
958958

doc/source/ecosystem.rst

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
:orphan:
2+
13
.. _ecosystem:
24

35
{{ header }}

doc/source/getting_started/basics.rst

+16-3
Original file line numberDiff line numberDiff line change
@@ -986,7 +986,7 @@ not noted for a particular column will be ``NaN``:
986986
987987
tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'})
988988
989-
.. _basics.aggregation.mixed_dtypes:
989+
.. _basics.aggregation.mixed_string:
990990

991991
Mixed dtypes
992992
++++++++++++
@@ -1704,14 +1704,21 @@ built-in string methods. For example:
17041704

17051705
.. ipython:: python
17061706
1707-
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
1707+
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
1708+
dtype="string")
17081709
s.str.lower()
17091710
17101711
Powerful pattern-matching methods are provided as well, but note that
17111712
pattern-matching generally uses `regular expressions
17121713
<https://docs.python.org/3/library/re.html>`__ by default (and in some cases
17131714
always uses them).
17141715

1716+
.. note::
1717+
1718+
Prior to pandas 1.0, string methods were only available on ``object``-dtype
1719+
``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated
1720+
to strings. See :ref:`text.types` for more.
1721+
17151722
Please see :ref:`Vectorized String Methods <text.string_methods>` for a complete
17161723
description.
17171724

@@ -1925,9 +1932,15 @@ period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays.
19251932
sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse`
19261933
intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex`
19271934
nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays.IntegerArray` :ref:`integer_na`
1935+
Strings :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text`
19281936
=================== ========================= ================== ============================= =============================
19291937

1930-
Pandas uses the ``object`` dtype for storing strings.
1938+
Pandas has two ways to store strings.
1939+
1940+
1. ``object`` dtype, which can hold any Python object, including strings.
1941+
2. :class:`StringDtype`, which is dedicated to strings.
1942+
1943+
Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` for more.
19311944

19321945
Finally, arbitrary objects may be stored using the ``object`` dtype, but should
19331946
be avoided to the extent possible (for performance and interoperability with

doc/source/getting_started/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Getting started
1212
.. toctree::
1313
:maxdepth: 2
1414

15+
install
1516
overview
1617
10min
1718
basics
File renamed without changes.

doc/source/index.rst.template

+2-3
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,8 @@ See the :ref:`overview` for more detail about what's in the library.
4040
{% endif %}
4141
{% if not single_doc %}
4242
What's New in 1.0.0 <whatsnew/v1.0.0>
43-
install
4443
getting_started/index
4544
user_guide/index
46-
ecosystem
4745
{% endif -%}
4846
{% if include_api -%}
4947
reference/index
@@ -54,9 +52,9 @@ See the :ref:`overview` for more detail about what's in the library.
5452
{% endif %}
5553

5654
* :doc:`whatsnew/v1.0.0`
57-
* :doc:`install`
5855
* :doc:`getting_started/index`
5956

57+
* :doc:`getting_started/install`
6058
* :doc:`getting_started/overview`
6159
* :doc:`getting_started/10min`
6260
* :doc:`getting_started/basics`
@@ -83,6 +81,7 @@ See the :ref:`overview` for more detail about what's in the library.
8381
* :doc:`user_guide/style`
8482
* :doc:`user_guide/options`
8583
* :doc:`user_guide/enhancingperf`
84+
* :doc:`user_guide/scale`
8685
* :doc:`user_guide/sparse`
8786
* :doc:`user_guide/gotchas`
8887
* :doc:`user_guide/cookbook`

doc/source/reference/arrays.rst

+25-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Intervals :class:`IntervalDtype` :class:`Interval` :ref:`api.array
2424
Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.arrays.integer_na`
2525
Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical`
2626
Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse`
27+
Strings :class:`StringDtype` :class:`str` :ref:`api.arrays.string`
2728
=================== ========================= ================== =============================
2829

2930
Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`).
@@ -460,6 +461,29 @@ and methods if the :class:`Series` contains sparse values. See
460461
:ref:`api.series.sparse` for more.
461462

462463

464+
.. _api.arrays.string:
465+
466+
Text data
467+
---------
468+
469+
When working with text data, where each valid element is a string or missing,
470+
we recommend using :class:`StringDtype` (with the alias ``"string"``).
471+
472+
.. autosummary::
473+
:toctree: api/
474+
:template: autosummary/class_without_autosummary.rst
475+
476+
arrays.StringArray
477+
478+
.. autosummary::
479+
:toctree: api/
480+
:template: autosummary/class_without_autosummary.rst
481+
482+
StringDtype
483+
484+
The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.StringArray`.
485+
See :ref:`api.series.str` for more.
486+
463487

464488
.. Dtype attributes which are manually listed in their docstrings: including
465489
.. it here to make sure a docstring page is built for them
@@ -471,4 +495,4 @@ and methods if the :class:`Series` contains sparse values. See
471495
DatetimeTZDtype.unit
472496
DatetimeTZDtype.tz
473497
PeriodDtype.freq
474-
IntervalDtype.subtype
498+
IntervalDtype.subtype

doc/source/reference/indexing.rst

-1
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,6 @@ Selecting
166166
Index.get_slice_bound
167167
Index.get_value
168168
Index.get_values
169-
Index.set_value
170169
Index.isin
171170
Index.slice_indexer
172171
Index.slice_locs

doc/source/user_guide/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ Further information on any specific method can be obtained in the
3838
style
3939
options
4040
enhancingperf
41+
scale
4142
sparse
4243
gotchas
4344
cookbook

doc/source/user_guide/io.rst

+5-2
Original file line numberDiff line numberDiff line change
@@ -4710,7 +4710,8 @@ Several caveats.
47104710
indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can
47114711
force including or omitting indexes with the ``index`` argument, regardless of the underlying engine.
47124712
* Index level names, if specified, must be strings.
4713-
* Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
4713+
* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype.
4714+
* The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
47144715
* Unsupported types include ``Period`` and actual Python object types. These will raise a helpful error message
47154716
on an attempt at serialization.
47164717

@@ -4734,7 +4735,9 @@ See the documentation for `pyarrow <https://arrow.apache.org/docs/python/>`__ an
47344735
'd': np.arange(4.0, 7.0, dtype='float64'),
47354736
'e': [True, False, True],
47364737
'f': pd.date_range('20130101', periods=3),
4737-
'g': pd.date_range('20130101', periods=3, tz='US/Eastern')})
4738+
'g': pd.date_range('20130101', periods=3, tz='US/Eastern'),
4739+
'h': pd.Categorical(list('abc')),
4740+
'i': pd.Categorical(list('abc'), ordered=True)})
47384741
47394742
df
47404743
df.dtypes

0 commit comments

Comments
 (0)