Merge branch 'main' into 37715-remove-mypy-ignore

natmokval · natmokval · commit 5b850acf1b98 · 2023-03-29T11:45:05.000+02:00
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -524,6 +524,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.api.extensions.ExtensionArray.insert \
         pandas.api.extensions.ExtensionArray.isin \
         pandas.api.extensions.ExtensionArray.isna \
+        pandas.api.extensions.ExtensionArray.map \
         pandas.api.extensions.ExtensionArray.ravel \
         pandas.api.extensions.ExtensionArray.searchsorted \
         pandas.api.extensions.ExtensionArray.shift \
diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst
@@ -533,7 +533,7 @@ Data sets do not only contain numerical data. pandas provides a wide range of fu
 Coming from...
 --------------
 
-Are you familiar with other software for manipulating tablular data? Learn
+Are you familiar with other software for manipulating tabular data? Learn
 the pandas-equivalent operations compared to software you already know:
 
 .. panels::
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
@@ -53,6 +53,7 @@ objects.
       api.extensions.ExtensionArray.insert
       api.extensions.ExtensionArray.isin
       api.extensions.ExtensionArray.isna
+      api.extensions.ExtensionArray.map
       api.extensions.ExtensionArray.ravel
       api.extensions.ExtensionArray.repeat
       api.extensions.ExtensionArray.searchsorted
diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
@@ -322,7 +322,7 @@ As usual, **both sides** of the slicers are included as this is label indexing.
 .. warning::
 
    You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and
-   for the **columns**. There are some ambiguous cases where the passed indexer could be mis-interpreted
+   for the **columns**. There are some ambiguous cases where the passed indexer could be misinterpreted
    as indexing *both* axes, rather than into say the ``MultiIndex`` for the rows.
 
    You should do this:
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -149,7 +149,7 @@ the columns except the one we specify:
    grouped.sum()
 
 The above GroupBy will split the DataFrame on its index (rows). To split by columns, first do
-a tranpose:
+a transpose:
 
 .. ipython::
 
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
@@ -507,14 +507,18 @@ used if a custom frequency string is passed.
 Timestamp limitations
 ---------------------
 
-Since pandas represents timestamps in nanosecond resolution, the time span that
+The limits of timestamp representation depend on the chosen resolution. For
+nanosecond resolution, the time span that
 can be represented using a 64-bit integer is limited to approximately 584 years:
 
 .. ipython:: python
 
    pd.Timestamp.min
    pd.Timestamp.max
 
+When choosing second resolution, the available range grows to ``+/- 2.9e11 years``.
+Different resolutions can be converted to each other through ``as_unit``.
+
 .. seealso::
 
    :ref:`timeseries.oob`
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -209,6 +209,7 @@ I/O
 ^^^
 - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
 - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
+- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
 -
 
 Period
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -1491,7 +1491,7 @@ def validate_func_kwargs(
     Returns
     -------
     columns : List[str]
-        List of user-provied keys.
+        List of user-provided keys.
     func : List[Union[str, callable[...,Any]]]
         List of user-provided aggfuncs
 
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -251,7 +251,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
             except pa.ArrowInvalid:
                 # GH50430: let pyarrow infer type, then cast
                 scalars = pa.array(scalars, from_pandas=True)
-        if pa_dtype:
+        if pa_dtype and scalars.type != pa_dtype:
             scalars = scalars.cast(pa_dtype)
         return cls(scalars)
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1719,6 +1719,12 @@ def map(self, mapper, na_action=None):
             The output of the mapping function applied to the array.
             If the function returns a tuple with more than one element
             a MultiIndex will be returned.
+
+        Examples
+        --------
+        >>> ext_arr = pd.array([1, 2, 3])
+        >>> ext_arr.map(str)
+        array(['1', '2', '3'], dtype=object)
         """
         return map_array(self, mapper, na_action=na_action)
 
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -711,13 +711,13 @@ def register_converter_cb(key) -> None:
 styler_max_rows = """
 : int, optional
     The maximum number of rows that will be rendered. May still be reduced to
-    satsify ``max_elements``, which takes precedence.
+    satisfy ``max_elements``, which takes precedence.
 """
 
 styler_max_columns = """
 : int, optional
     The maximum number of columns that will be rendered. May still be reduced to
-    satsify ``max_elements``, which takes precedence.
+    satisfy ``max_elements``, which takes precedence.
 """
 
 styler_precision = """
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -1697,7 +1697,7 @@ def pandas_dtype(dtype) -> DtypeObj:
     try:
         with warnings.catch_warnings():
             # GH#51523 - Series.astype(np.integer) doesn't show
-            # numpy deprication warning of np.integer
+            # numpy deprecation warning of np.integer
             # Hence enabling DeprecationWarning
             warnings.simplefilter("always", DeprecationWarning)
             npdtype = np.dtype(dtype)
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -901,7 +901,7 @@ def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset:
                 return freq_offset
 
         raise TypeError(
-            "PeriodDtype argument should be string or BaseOffet, "
+            "PeriodDtype argument should be string or BaseOffset, "
             f"got {type(freq).__name__}"
         )
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -6559,7 +6559,7 @@ def infer_objects(self, copy: bool_t | None = None) -> Self:
         Parameters
         ----------
         copy : bool, default True
-            Whether to make a copy for non-object or non-inferrable columns
+            Whether to make a copy for non-object or non-inferable columns
             or Series.
 
         Returns
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
@@ -89,7 +89,7 @@ def get_sheet_data(
         file_rows_needed: int | None = None,
     ) -> list[list[Scalar]]:
         data: list[list[Scalar]] = []
-        prevous_row_number = -1
+        previous_row_number = -1
         # When sparse=True the rows can have different lengths and empty rows are
         # not returned. The cells are namedtuples of row, col, value (r, c, v).
         for row in sheet.rows(sparse=True):
@@ -99,9 +99,9 @@ def get_sheet_data(
                 # trim trailing empty elements
                 converted_row.pop()
             if converted_row:
-                data.extend([[]] * (row_number - prevous_row_number - 1))
+                data.extend([[]] * (row_number - previous_row_number - 1))
                 data.append(converted_row)
-                prevous_row_number = row_number
+                previous_row_number = row_number
             if file_rows_needed is not None and len(data) >= file_rows_needed:
                 break
         if data:
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -740,7 +740,7 @@ def _calc_max_rows_fitted(self) -> int | None:
             _, height = get_terminal_size()
             if self.max_rows == 0:
                 # rows available to fill with actual data
-                return height - self._get_number_of_auxillary_rows()
+                return height - self._get_number_of_auxiliary_rows()
 
             if self._is_screen_short(height):
                 max_rows = height
@@ -775,7 +775,7 @@ def _is_screen_narrow(self, max_width) -> bool:
     def _is_screen_short(self, max_height) -> bool:
         return bool(self.max_rows == 0 and len(self.frame) > max_height)
 
-    def _get_number_of_auxillary_rows(self) -> int:
+    def _get_number_of_auxiliary_rows(self) -> int:
         """Get number of rows occupied by prompt, dots and dimension info."""
         dot_row = 1
         prompt_row = 1
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -582,7 +582,6 @@ def __init__(self, *args, **kwargs) -> None:
     def _parse_tables(self, doc, match, attrs):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)
-
         if not tables:
             raise ValueError("No tables found")
 
@@ -592,13 +591,15 @@ def _parse_tables(self, doc, match, attrs):
 
         for table in tables:
             if self.displayed_only:
+                for elem in table.find_all("style"):
+                    elem.decompose()
+
                 for elem in table.find_all(style=re.compile(r"display:\s*none")):
                     elem.decompose()
 
             if table not in unique_tables and table.find(string=match) is not None:
                 result.append(table)
             unique_tables.add(table)
-
         if not result:
             raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
         return result
@@ -730,10 +731,11 @@ def _parse_tables(self, doc, match, kwargs):
                 # lxml utilizes XPATH 1.0 which does not have regex
                 # support. As a result, we find all elements with a style
                 # attribute and iterate them to check for display:none
+                for elem in table.xpath(".//style"):
+                    elem.drop_tree()
                 for elem in table.xpath(".//*[@style]"):
                     if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                         elem.drop_tree()
-
         if not tables:
             raise ValueError(f"No tables found matching regex {repr(pattern)}")
         return tables
@@ -1170,6 +1172,7 @@ def read_html(
             '{None, "header", "footer", "body", "all"}, got '
             f'"{extract_links}"'
         )
+
     validate_header_arg(header)
     check_dtype_backend(dtype_backend)
 
diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py
@@ -86,11 +86,11 @@ def test_comparisons(self, factor):
             cat_rev > cat_rev_base2
 
         # Only categories with same ordering information can be compared
-        cat_unorderd = cat.set_ordered(False)
+        cat_unordered = cat.set_ordered(False)
         assert not (cat > cat).any()
 
         with pytest.raises(TypeError, match=msg):
-            cat > cat_unorderd
+            cat > cat_unordered
 
         # comparison (in both directions) with Series will raise
         s = Series(["b", "b", "b"])
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
@@ -523,7 +523,7 @@ def test_freq_argument_required(self):
         with pytest.raises(TypeError, match=msg):
             PeriodDtype()
 
-        msg = "PeriodDtype argument should be string or BaseOffet, got NoneType"
+        msg = "PeriodDtype argument should be string or BaseOffset, got NoneType"
         with pytest.raises(TypeError, match=msg):
             # GH#51790
             PeriodDtype(None)
diff --git a/pandas/tests/frame/methods/test_isetitem.py b/pandas/tests/frame/methods/test_isetitem.py
@@ -38,7 +38,7 @@ def test_isetitem_ea_df_scalar_indexer(self):
         )
         tm.assert_frame_equal(df, expected)
 
-    def test_isetitem_dimension_missmatch(self):
+    def test_isetitem_dimension_mismatch(self):
         # GH#51701
         df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
         value = df.copy()
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -2006,7 +2006,7 @@ def test_inplace_arithmetic_series_update(using_copy_on_write):
         tm.assert_frame_equal(df, expected)
 
 
-def test_arithemetic_multiindex_align():
+def test_arithmetic_multiindex_align():
     """
     Regression test for: https://github.com/pandas-dev/pandas/issues/33765
     """
diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py
@@ -11,7 +11,7 @@
 
 
 class TestAsArray:
-    def test_asarray_homogenous(self):
+    def test_asarray_homogeneous(self):
         df = DataFrame({"A": Categorical([1, 2]), "B": Categorical([1, 2])})
         result = np.asarray(df)
         # may change from object in the future
diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py
@@ -84,7 +84,7 @@ def test_invert_mixed(self):
         )
         tm.assert_frame_equal(result, expected)
 
-    def test_invert_empy_not_input(self):
+    def test_invert_empty_not_input(self):
         # GH#51032
         df = pd.DataFrame()
         result = ~df
diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py
@@ -603,12 +603,12 @@ def test_filter_non_bool_raises():
 def test_filter_dropna_with_empty_groups():
     # GH 10780
     data = Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
-    groupped = data.groupby(level=0)
-    result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False)
+    grouped = data.groupby(level=0)
+    result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
     expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
     tm.assert_series_equal(result_false, expected_false)
 
-    result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True)
+    result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
     expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64)
     tm.assert_series_equal(result_true, expected_true)
 
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
@@ -320,7 +320,7 @@ def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
 
 def test_readjson_nrows_requires_lines(engine):
     # GH 33916
-    # Test ValuError raised if nrows is set without setting lines in read_json
+    # Test ValueError raised if nrows is set without setting lines in read_json
     jsonl = """{"a": 1, "b": 2}
         {"a": 3, "b": 4}
         {"a": 5, "b": 6}
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1495,3 +1495,28 @@ def test_invalid_dtype_backend(self):
         )
         with pytest.raises(ValueError, match=msg):
             read_html("test", dtype_backend="numpy")
+
+    def test_style_tag(self):
+        # GH 48316
+        data = """
+        <table>
+            <tr>
+                <th>
+                    <style>.style</style>
+                    A
+                    </th>
+                <th>B</th>
+            </tr>
+            <tr>
+                <td>A1</td>
+                <td>B1</td>
+            </tr>
+            <tr>
+                <td>A2</td>
+                <td>B2</td>
+            </tr>
+        </table>
+        """
+        result = self.read_html(data)[0]
+        expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
@@ -983,12 +983,12 @@ def test_df_axis_param_depr():
     index.name = "date"
     df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index).T
 
-    # Deprication error when axis=1 is explicitly passed
+    # Deprecation error when axis=1 is explicitly passed
     warning_msg = "DataFrame.resample with axis=1 is deprecated."
     with tm.assert_produces_warning(FutureWarning, match=warning_msg):
         df.resample("M", axis=1)
 
-    # Deprication error when axis=0 is explicitly passed
+    # Deprecation error when axis=0 is explicitly passed
     df = df.T
     warning_msg = (
         "The 'axis' keyword in DataFrame.resample is deprecated and "
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
@@ -338,7 +338,7 @@ def test_concat_mixed_objs(self):
         result = concat([s1, df, s2], ignore_index=True)
         tm.assert_frame_equal(result, expected)
 
-    def test_dtype_coerceion(self):
+    def test_dtype_coercion(self):
         # 12411
         df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]})
 
diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py
@@ -730,7 +730,7 @@ def test_constructor_fromisocalendar(self):
         assert isinstance(result, Timestamp)
 
 
-def test_constructor_ambigous_dst():
+def test_constructor_ambiguous_dst():
     # GH 24329
     # Make sure that calling Timestamp constructor
     # on Timestamp created from ambiguous time
diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py
@@ -29,7 +29,7 @@
 
 class TestTimestampUnaryOps:
     # --------------------------------------------------------------
-    def test_round_divison_by_zero_raises(self):
+    def test_round_division_by_zero_raises(self):
         ts = Timestamp("2016-01-01")
 
         msg = "Division by zero in rounding"
diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py
@@ -576,7 +576,7 @@ def test_getitem_dataframe_raises():
         ser[df > 5]
 
 
-def test_getitem_assignment_series_aligment():
+def test_getitem_assignment_series_alignment():
     # https://github.com/pandas-dev/pandas/issues/37427
     # with getitem, when assigning with a Series, it is not first aligned
     ser = Series(range(10))
diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py
diff --git a/pyproject.toml b/pyproject.toml

Original file line number	Diff line number	Diff line change
`@@ -209,6 +209,7 @@ I/O`
`209`	`209`	`^^^`
`210`	`210`	- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
`211`	`211`	- :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
	`212`	+- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
`212`	`213`	`-`
`213`	`214`
`214`	`215`	`Period`
Original file line number	Diff line number	Diff line change
`@@ -901,7 +901,7 @@ def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset:`
`901`	`901`	`return freq_offset`
`902`	`902`
`903`	`903`	`raise TypeError(`
`904`		`- "PeriodDtype argument should be string or BaseOffet, "`
	`904`	`+ "PeriodDtype argument should be string or BaseOffset, "`
`905`	`905`	`f"got {type(freq).__name__}"`
`906`	`906`	`)`
`907`	`907`
Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ def test_isetitem_ea_df_scalar_indexer(self):`
`38`	`38`	`)`
`39`	`39`	`tm.assert_frame_equal(df, expected)`
`40`	`40`
`41`		`- def test_isetitem_dimension_missmatch(self):`
	`41`	`+ def test_isetitem_dimension_mismatch(self):`
`42`	`42`	`# GH#51701`
`43`	`43`	`df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})`
`44`	`44`	`value = df.copy()`
Original file line number	Diff line number	Diff line change
`@@ -84,7 +84,7 @@ def test_invert_mixed(self):`
`84`	`84`	`)`
`85`	`85`	`tm.assert_frame_equal(result, expected)`
`86`	`86`
`87`		`- def test_invert_empy_not_input(self):`
	`87`	`+ def test_invert_empty_not_input(self):`
`88`	`88`	`# GH#51032`
`89`	`89`	`df = pd.DataFrame()`
`90`	`90`	`result = ~df`