From 5f4170dfc62ffb93b8909e0c20174984eed4f5f1 Mon Sep 17 00:00:00 2001 From: Pedro Diogo Date: Fri, 6 Jun 2025 04:08:43 +0100 Subject: [PATCH 1/3] feature #58141: Consistent naming conventions for string dtype aliases Key implementation steps: - Created factory functions (string, datetime, integer, floating, decimal, boolean, list, categorical, interval, period, sparse, date, duration, map, struct) to generate pandas dtypes (e.g., StringDtype, Int64Dtype, ArrowDtype) based on parameters like backend, bits, unit, and precision. - Added support for both NumPy and PyArrow backends, enabling seamless switching (e.g., integer() returns Int64Dtype for NumPy or ArrowDtype(pa.int64()) for PyArrow). - Implemented parameter validation to ensure correct usage (e.g., validating mode in string() to be "string" or "binary", and unit in datetime() for NumPy). - Integrated PyArrow types for advanced dtypes (e.g., pa.float64(), pa.list_(), pa.map_()), supporting modern data processing frameworks. - Implemented comprehensive tests in test_factory.py to validate dtype creation across all functions, ensuring correct behavior for different backends, verifying string representations (e.g., "double[pyarrow]" for pa.float64()), and confirming proper error handling (e.g., raising ValueError for invalid inputs). - Addressed PyArrow compatibility by implementing correct method calls, such as using pa.bool_() for boolean dtypes, ensuring proper integration. This change simplifies dtype creation, reduces duplication, and ensures compatibility across backends, making it easier to extend support for new dtypes in the future. 
Co-authored-by: Pedro Santos --- pandas/__init__.py | 34 ++ pandas/core/dtypes/factory.py | 737 ++++++++++++++++++++++++++++ pandas/tests/dtypes/test_factory.py | 302 ++++++++++++ 3 files changed, 1073 insertions(+) create mode 100644 pandas/core/dtypes/factory.py create mode 100644 pandas/tests/dtypes/test_factory.py diff --git a/pandas/__init__.py b/pandas/__init__.py index 8b92ad6cdfebb..dfbc1894c441d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -42,6 +42,25 @@ # let init-time option registration happen import pandas.core.config_init # pyright: ignore[reportUnusedImport] # noqa: F401 +# Import the new factory functions +from pandas.core.dtypes.factory import ( + boolean, + categorical, + date, + datetime, + decimal, + duration, + floating, + integer, + interval, + list, + map, + period, + sparse, + string, + struct, +) + from pandas.core.api import ( # dtype ArrowDtype, @@ -281,24 +300,35 @@ "array", "arrays", "bdate_range", + "boolean", + "categorical", "concat", "crosstab", "cut", + "date", "date_range", + "datetime", + "decimal", "describe_option", + "duration", "errors", "eval", "factorize", + "floating", "from_dummies", "get_dummies", "get_option", "infer_freq", + "integer", + "interval", "interval_range", "io", "isna", "isnull", "json_normalize", + "list", "lreshape", + "map", "melt", "merge", "merge_asof", @@ -308,6 +338,7 @@ "offsets", "option_context", "options", + "period", "period_range", "pivot", "pivot_table", @@ -337,6 +368,9 @@ "set_eng_float_format", "set_option", "show_versions", + "sparse", + "string", + "struct", "test", "testing", "timedelta_range", diff --git a/pandas/core/dtypes/factory.py b/pandas/core/dtypes/factory.py new file mode 100644 index 0000000000000..824ec722b9460 --- /dev/null +++ b/pandas/core/dtypes/factory.py @@ -0,0 +1,737 @@ +""" +Factory functions for creating pandas dtypes with consistent naming conventions. 
+""" + +from __future__ import annotations + +from typing import ( + Any, + Literal, +) + +import numpy as np +import pyarrow as pa + +from pandas._libs import missing as libmissing + +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, + SparseDtype, +) + +from pandas.core.api import ( + ArrowDtype, + BooleanDtype, + Float32Dtype, + Float64Dtype, + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, +) +from pandas.core.arrays.string_ import StringDtype + + +def string( + backend: Literal["python", "pyarrow"] = "python", + large: bool = False, + mode: Literal["string", "binary"] = "string", + na_value: Any = libmissing.NA, +) -> StringDtype | ArrowDtype: + """ + Create a string or binary dtype with specified backend and NA value. + + Parameters + ---------- + backend : {"python", "pyarrow"}, default "python" + The backend to use for string or binary data. + large : bool, default False + If True and backend is "pyarrow", uses pa.large_string() or pa.large_binary(). + mode : {"string", "binary"}, default "string" + Whether to create a string or binary dtype. + na_value : {pd.NA, np.nan}, default pd.NA + The value to use for missing data. Must be either pd.NA or np.nan. + + Returns + ------- + StringDtype or ArrowDtype + A string or binary dtype with the specified configuration. 
+ + Examples + -------- + >>> string() # Default python backend with pd.NA + string[python] + >>> string(backend="pyarrow", mode="string") # PyArrow string backend + string[pyarrow] + >>> string(backend="pyarrow", mode="string", large=True) # PyArrow large string + large_string[pyarrow] + >>> string(backend="pyarrow", mode="binary") # PyArrow binary + binary[pyarrow] + >>> string(backend="pyarrow", mode="binary", large=True) # PyArrow large binary + large_binary[pyarrow] + """ + valid_modes = ["string", "binary"] + if mode not in valid_modes: + raise ValueError(f"mode must be one of {valid_modes}, got {mode}") + if backend == "pyarrow": + if mode == "string": + pa_type = pa.large_string() if large else pa.string() + else: # mode == "binary" + pa_type = pa.large_binary() if large else pa.binary() + return ArrowDtype(pa_type) + if mode == "binary": + raise ValueError("Binary mode is only supported with PyArrow backend.") + return StringDtype(storage="python", na_value=na_value) + + +def datetime( + unit: str = "ns", + tz: Any | None = None, + backend: Literal["numpy", "pyarrow"] = "numpy", +) -> np.dtype | DatetimeTZDtype | ArrowDtype: + """ + Create a datetime dtype with specified unit, timezone and backend. + + Parameters + ---------- + unit : str, default "ns" + The datetime unit to use. + tz : str, int, or datetime.tzinfo, optional + The timezone to use. + backend : {"numpy", "pyarrow"}, default "numpy" + The backend to use for datetime storage. + + Returns + ------- + Union[np.dtype, DatetimeTZDtype, ArrowDtype] + A datetime dtype with the specified configuration. 
+ + Examples + -------- + >>> pd.datetime() # Default numpy backend with ns unit + datetime64[ns] + >>> pd.datetime(unit="us") # Microsecond precision + datetime64[us] + >>> pd.datetime(tz="UTC") # Timezone-aware datetime + datetime64[ns, UTC] + >>> pd.datetime(backend="pyarrow") # PyArrow backend + timestamp[ns][pyarrow] + """ + valid_units = ["D", "h", "m", "s", "ms", "us", "ns"] + if backend == "numpy": + if unit not in valid_units: + raise ValueError(f"unit must be one of {valid_units}, got {unit}") + if tz is not None: + return DatetimeTZDtype(unit=unit, tz=tz) + return np.dtype(f"datetime64[{unit}]") + else: # pyarrow + return ArrowDtype(pa.timestamp(unit, tz=tz)) + + +def integer( + bits: int = 64, + backend: Literal["numpy", "pyarrow", "pandas"] = "pandas", +) -> Int8Dtype | Int16Dtype | Int32Dtype | Int64Dtype | ArrowDtype | np.dtype[Any]: + """ + Create an integer dtype with specified bits and backend. + + Parameters + ---------- + bits : int, default 64 + The number of bits for the integer type. Must be one of 8, 16, 32, or 64. + backend : {"pandas", "numpy", "pyarrow"}, default "pandas" + The backend to use for integer storage. + + Returns + ------- + Union[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, ArrowDtype] + An integer dtype with the specified configuration. 
+ + Examples + -------- + >>> integer() # Default: 64 bits with pandas backend + Int64 + >>> integer(bits=32) # 32-bit integer with pandas backend + Int32 + >>> integer(bits=64, backend="numpy") # 64-bit integer with NumPy backend + dtype('int64') + >>> integer(bits=64, backend="pyarrow") # 64-bit integer with PyArrow backend + int64[pyarrow] + """ + valid_bits = [8, 16, 32, 64] + if bits not in valid_bits: + raise ValueError(f"bits must be one of {valid_bits}, got {bits}") + + if backend == "numpy": + return np.dtype(f"int{bits}") + + if backend == "pandas": + if bits == 8: + return Int8Dtype() + elif bits == 16: + return Int16Dtype() + elif bits == 32: + return Int32Dtype() + elif bits == 64: + return Int64Dtype() + elif backend == "pyarrow": + if bits == 8: + return ArrowDtype(pa.int8()) + elif bits == 16: + return ArrowDtype(pa.int16()) + elif bits == 32: + return ArrowDtype(pa.int32()) + elif bits == 64: + return ArrowDtype(pa.int64()) + else: + raise ValueError(f"Unsupported backend: {backend!r}") + + +def floating( + bits: int = 64, + backend: Literal["numpy", "pyarrow", "pandas"] = "pandas", +) -> Float32Dtype | Float64Dtype | ArrowDtype | np.dtype[Any]: + """ + Create a floating-point dtype with specified bits and backend. + + Parameters + ---------- + bits : int, default 64 + The number of bits for the floating-point type. Must be one of 32 or 64. + backend : {"numpy", "pyarrow", "pandas"}, default "pandas" + The backend to use for floating-point storage. + + Returns + ------- + Union[Float32Dtype, Float64Dtype, ArrowDtype, np.dtype] + A floating-point dtype with the specified configuration. 
+ + Examples + -------- + >>> floating() # Default: 64 bits with pandas backend + Float64 + >>> floating(bits=32) # 32-bit float with pandas backend + Float32 + >>> floating(bits=64, backend="pyarrow") # 64-bit float with PyArrow backend + double[pyarrow] + """ + valid_bits = [32, 64] + if bits not in valid_bits: + raise ValueError(f"bits must be one of {valid_bits}, got {bits}") + + if backend == "numpy": + return np.dtype(f"float{bits}") + + if backend == "pandas": + if bits == 32: + return Float32Dtype() + elif bits == 64: + return Float64Dtype() + elif backend == "pyarrow": + if bits == 32: + return ArrowDtype(pa.float32()) + elif bits == 64: + return ArrowDtype(pa.float64()) + else: + raise ValueError(f"Unsupported backend: {backend!r}") + + +def decimal( + precision: int, + scale: int, + backend: Literal["pyarrow"] = "pyarrow", +) -> ArrowDtype: + """ + Create a decimal dtype with specified precision and scale. + + Parameters + ---------- + precision : int + The total number of digits in the decimal number. + scale : int + The number of digits to the right of the decimal point. + backend : {"pyarrow"}, default "pyarrow" + The backend to use for decimal storage. Only PyArrow is supported. + + Returns + ------- + ArrowDtype + A decimal dtype with the specified configuration. + + Examples + -------- + >>> decimal(precision=10, scale=2) # Decimal with 10 digits, + ... # 2 after the decimal point + decimal128[10, 2][pyarrow] + >>> decimal(precision=40, scale=5) # Larger precision, uses decimal256 + decimal256[40, 5][pyarrow] + """ + if backend == "pyarrow": + if precision <= 38: + return ArrowDtype(pa.decimal128(precision, scale)) + return ArrowDtype(pa.decimal256(precision, scale)) + raise ValueError("Decimal types are only supported with PyArrow backend.") + + +def boolean( + backend: Literal["numpy", "pyarrow"] = "numpy", +) -> BooleanDtype | ArrowDtype: + """ + Create a boolean dtype with specified backend. 
+ + Parameters + ---------- + backend : {"numpy", "pyarrow"}, default "numpy" + The backend to use for boolean storage. + + Returns + ------- + Union[BooleanDtype, ArrowDtype] + A boolean dtype with the specified configuration. + + Examples + -------- + >>> boolean() # Default: NumPy backend + boolean + >>> boolean(backend="pyarrow") # PyArrow backend + bool[pyarrow] + """ + if backend == "numpy": + return BooleanDtype() + else: # pyarrow + return ArrowDtype(pa.bool_()) + + +def list( + value_type: Any = None, + large: bool = False, + backend: Literal["numpy", "pyarrow"] = "numpy", +) -> np.dtype | ArrowDtype: + """ + Create a list dtype with specified value type, size, and backend. + + Parameters + ---------- + value_type : Any, optional + The type of the list elements (e.g., pa.int64(), pa.string()). If None, + defaults to object (NumPy) or int64 (PyArrow). + large : bool, default False + If True and backend is "pyarrow", uses pa.large_list() instead of pa.list_(). + backend : {"numpy", "pyarrow"}, default "numpy" + The backend to use for list storage. + + Returns + ------- + Union[np.dtype, ArrowDtype] + A list dtype with the specified configuration. + + Examples + -------- + >>> list() # Default numpy backend + object + >>> list(backend="pyarrow") # PyArrow backend with default int64 + list[int64][pyarrow] + >>> list(value_type=pa.string(), backend="pyarrow") # PyArrow with string + list[string][pyarrow] + >>> list( + ... value_type=pa.string(), large=True, backend="pyarrow" + ... 
) # PyArrow large list + large_list[string][pyarrow] + """ + if backend == "numpy": + return np.dtype("object") + else: # pyarrow + if value_type is None: + value_type = pa.int64() + pa_type = pa.large_list(value_type) if large else pa.list_(value_type) + return ArrowDtype(pa_type) + + +def categorical( + categories: list[Any] | None = None, + ordered: bool = False, + index_type: Any = None, + value_type: Any = None, + backend: Literal["numpy", "pyarrow"] = "numpy", +) -> CategoricalDtype | ArrowDtype: + """ + Create a categorical dtype with specified categories, ordering, and backend. + + Parameters + ---------- + categories : list, optional + The categories for the categorical dtype. + ordered : bool, default False + Whether the categories are ordered. + index_type : Any, optional + The type of the dictionary indices (PyArrow only, e.g., pa.int32()). + Defaults to pa.int32() if None. + value_type : Any, optional + The type of the dictionary values (PyArrow only, e.g., pa.string()). + Defaults to pa.string() if None. + backend : {"numpy", "pyarrow"}, default "numpy" + The backend to use for categorical storage. + + Returns + ------- + Union[CategoricalDtype, ArrowDtype] + A categorical dtype with the specified configuration. 
+ + Examples + -------- + >>> categorical() # Default numpy backend + category + >>> categorical(categories=["a", "b", "c"]) # With categories + category + >>> categorical(ordered=True) # Ordered categories + category + >>> categorical(backend="pyarrow") # PyArrow backend + dictionary[pyarrow] + >>> categorical(index_type=pa.int64(), value_type=pa.int32(), backend="pyarrow") + dictionary[pyarrow] + """ + if backend == "numpy": + return CategoricalDtype(categories=categories, ordered=ordered) + else: # pyarrow + index_type = pa.int32() if index_type is None else index_type + value_type = pa.string() if value_type is None else value_type + return ArrowDtype(pa.dictionary(index_type, value_type)) + + +def interval( + subtype: Any = None, + closed: Literal["left", "right", "both", "neither"] = "right", + backend: Literal["numpy", "pyarrow"] = "numpy", +) -> IntervalDtype | ArrowDtype: + """ + Create an interval dtype with specified subtype and closed bounds. + + Parameters + ---------- + subtype : dtype, optional + The dtype of the interval bounds. + closed : {"left", "right", "both", "neither"}, default "right" + Whether the interval is closed on the left, right, both or neither. + backend : {"numpy", "pyarrow"}, default "numpy" + The backend to use for interval storage. + + Returns + ------- + Union[IntervalDtype, ArrowDtype] + An interval dtype with the specified configuration. 
+ + Examples + -------- + >>> interval() # Default numpy backend + interval + >>> interval(subtype="int64") # With specific subtype + interval[int64] + >>> interval(closed="both") # Closed on both sides + interval[both] + >>> interval(backend="pyarrow") # PyArrow backend + interval[pyarrow] + """ + if backend == "numpy": + return IntervalDtype(subtype=subtype, closed=closed) + else: # pyarrow + if subtype is not None: + return ArrowDtype( + pa.struct( + [ + ("left", pa.from_numpy_dtype(subtype)), + ("right", pa.from_numpy_dtype(subtype)), + ] + ) + ) + return ArrowDtype(pa.struct([("left", pa.float64()), ("right", pa.float64())])) + + +def period( + freq: str = "D", + backend: Literal["numpy", "pyarrow"] = "numpy", +) -> PeriodDtype | ArrowDtype: + """ + Create a period dtype with specified frequency. + + Parameters + ---------- + freq : str, default "D" + The frequency of the period. Common values are: + - "D" for daily + - "M" for monthly + - "Y" for yearly + - "H" for hourly + - "T" for minute + - "S" for second + backend : {"numpy", "pyarrow"}, default "numpy" + The backend to use for period storage. + + Returns + ------- + Union[PeriodDtype, ArrowDtype] + A period dtype with the specified configuration. + + Notes + ----- + PyArrow backend uses `month_day_nano_interval` for periods, which represents + intervals in terms of months, days, and nanoseconds. + + Examples + -------- + >>> period() # Default numpy backend with daily frequency + period[D] + >>> period(freq="M") # Monthly frequency + period[M] + >>> period(backend="pyarrow") # PyArrow backend + month_day_nano_interval[pyarrow] + """ + if backend == "numpy": + return PeriodDtype(freq=freq) + else: # pyarrow + return ArrowDtype(pa.month_day_nano_interval()) + + +def sparse( + dtype: Any = None, + fill_value: Any = None, + backend: Literal["numpy"] = "numpy", +) -> np.dtype | SparseDtype: + """ + Create a sparse dtype with specified dtype and fill value. 
+ + Parameters + ---------- + dtype : dtype, optional + The dtype of the non-sparse values. If None, defaults to float64. + fill_value : scalar, optional + The value to use for missing values. If None, defaults to np.nan for float + and 0 for integer dtypes. + backend : {"numpy"}, default "numpy" + The backend to use for sparse storage. Only NumPy is supported, as PyArrow + does not have a native sparse type. + + Returns + ------- + SparseDtype + A sparse dtype with the specified configuration. + + Examples + -------- + >>> sparse() # Default numpy backend + Sparse[float64, nan] + >>> sparse(dtype="int64") # With specific dtype + Sparse[int64, 0] + >>> sparse(fill_value=-1) # With specific fill value + Sparse[float64, -1] + """ + if backend != "numpy": + raise ValueError( + "Sparse types are only supported with NumPy backend, as PyArrow " + "does not have a native sparse type." + ) + + if dtype is None: + dtype = np.float64 + if fill_value is None: + fill_value = np.nan if np.issubdtype(dtype, np.floating) else 0 + return SparseDtype(dtype=dtype, fill_value=fill_value) + + +def date( + unit: Literal["day", "ms"] = "day", + backend: Literal["pyarrow"] = "pyarrow", +) -> ArrowDtype: + """ + Create a date dtype with specified unit, using PyArrow backend. + + This function creates a dtype for representing dates without time components, + suitable for calendar-based data. PyArrow provides two date types: `date32` for + day precision (stored as days since UNIX epoch) and `date64` for millisecond + precision (stored as milliseconds since UNIX epoch). NumPy does not natively + support a date-only type, so only PyArrow backend is supported. + + Parameters + ---------- + unit : {"day", "ms"}, default "day" + The precision unit for the date: + - "day": Uses `date32`, representing dates as days since UNIX epoch + (1970-01-01). + - "ms": Uses `date64`, representing dates as milliseconds since UNIX epoch. 
+ backend : {"pyarrow"}, default "pyarrow" + The backend to use for date storage. Only PyArrow is supported. + + Returns + ------- + ArrowDtype + A date dtype with the specified configuration, wrapped in an ArrowDtype. + + Raises + ------ + ValueError + If a backend other than "pyarrow" is specified. + + Examples + -------- + >>> date() # Default day precision with PyArrow + date32[pyarrow] + >>> date(unit="ms") # Millisecond precision with PyArrow + date64[pyarrow] + >>> import pandas as pd + >>> pd.Series( + ... [pd.Timestamp("2023-01-01"), pd.Timestamp("2023-01-02")], dtype=date() + ... ) + 0 2023-01-01 + 1 2023-01-02 + dtype: date32[pyarrow] + """ + + if backend != "pyarrow": + raise ValueError("Date types are only supported with PyArrow backend.") + return ArrowDtype(pa.date32() if unit == "day" else pa.date64()) + + +def duration( + unit: Literal["ns", "us", "ms", "s"] = "ns", + backend: Literal["numpy", "pyarrow"] = "pyarrow", +) -> np.dtype | ArrowDtype: + """ + Create a duration dtype with specified unit and backend. + + Parameters + ---------- + unit : {"ns", "us", "ms", "s"}, default "ns" + The unit of precision for the duration: + - "ns": Nanoseconds. + - "us": Microseconds. + - "ms": Milliseconds. + - "s": Seconds. + backend : {"numpy", "pyarrow"}, default "pyarrow" + The backend to use for duration storage. + + Returns + ------- + Union[np.dtype, ArrowDtype] + A duration dtype with the specified configuration. 
+ + Examples + -------- + >>> duration() # Default PyArrow backend + duration[ns][pyarrow] + >>> duration(unit="s", backend="numpy") # NumPy backend + timedelta64[s] + """ + valid_units = ["ns", "us", "ms", "s"] + if unit not in valid_units: + raise ValueError(f"Unit must be one of {valid_units}") + if backend == "numpy": + return np.dtype(f"timedelta64[{unit}]") + else: # pyarrow + return ArrowDtype(pa.duration(unit)) + + +def map( + index_type: Any, + value_type: Any, + backend: Literal["pyarrow"] = "pyarrow", +) -> ArrowDtype: + """ + Create a map dtype with specified index and value types, using PyArrow backend. + + This function creates a dtype for representing key-value mappings (similar to a + dictionary), where each element is a list of key-value pairs. PyArrow's `map` type + ensures that keys are unique within each element. + This type is not natively supported by NumPy, so only PyArrow backend is supported. + + Parameters + ---------- + index_type : Any + The type of the map's keys (e.g., `pa.int32()`, `pa.string()`). + value_type : Any + The type of the map's values (e.g., `pa.float64()`, `pa.string()`). + backend : {"pyarrow"}, default "pyarrow" + The backend to use for map storage. Only PyArrow is supported. + + Returns + ------- + ArrowDtype + A map dtype with the specified configuration, wrapped in an ArrowDtype. + + Raises + ------ + ValueError + If a backend other than "pyarrow" is specified. 
+ + Examples + -------- + >>> map(index_type=pa.int32(), value_type=pa.string()) + map[pyarrow] + >>> import pandas as pd + >>> data = [[(1, "a"), (2, "b")], [(3, "c")]] + >>> pd.Series(data, dtype=map(pa.int32(), pa.string())) + 0 [(1, a), (2, b)] + 1 [(3, c)] + dtype: map[pyarrow] + """ + if backend != "pyarrow": + raise ValueError("Map types are only supported with PyArrow backend.") + return ArrowDtype(pa.map_(index_type, value_type)) + + +def struct( + fields: list[tuple[str, Any]], + backend: Literal["pyarrow"] = "pyarrow", +) -> ArrowDtype: + """ + Create a struct dtype with specified fields, using PyArrow backend. + + This function creates a dtype for representing structured data, where each element + is a record with named fields, similar to a named tuple or dictionary. Each field + in the struct has a name and a type, defined in the `fields` parameter. PyArrow's + `struct` type is used to store this data, allowing for nested structures. NumPy does + not natively support structured types in the same way, so only PyArrow backend is + supported. + + Parameters + ---------- + fields : list of tuple[str, Any] + A list of (name, type) tuples defining the fields of the struct, where: + - `name` is a string representing the field name. + - `type` is a PyArrow type (e.g., `pa.int32()`, `pa.string()`). + backend : {"pyarrow"}, default "pyarrow" + The backend to use for struct storage. Only PyArrow is supported. + + Returns + ------- + ArrowDtype + A struct dtype with the specified configuration, wrapped in an ArrowDtype. + + Raises + ------ + ValueError + If a backend other than "pyarrow" is specified. 
+ + Examples + -------- + >>> struct([("id", pa.int32()), ("name", pa.string())]) + struct[pyarrow] + >>> import pandas as pd + >>> data = [(1, "Alice"), (2, "Bob")] + >>> pd.Series(data, dtype=struct([("id", pa.int32()), ("name", pa.string())])) + 0 (1, Alice) + 1 (2, Bob) + dtype: struct[pyarrow] + """ + if backend != "pyarrow": + raise ValueError("Struct types are only supported with PyArrow backend.") + # Validate that fields is a list of (str, type) tuples + for field in fields: + if ( + not isinstance(field, tuple) + or len(field) != 2 + or not isinstance(field[0], str) + ): + raise ValueError("Each field must be a tuple of (str, type), got {field}") + return ArrowDtype(pa.struct(fields)) diff --git a/pandas/tests/dtypes/test_factory.py b/pandas/tests/dtypes/test_factory.py new file mode 100644 index 0000000000000..abf3c5bceeb7d --- /dev/null +++ b/pandas/tests/dtypes/test_factory.py @@ -0,0 +1,302 @@ +import numpy as np +import pyarrow as pa +import pytest + +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, + SparseDtype, +) +from pandas.core.dtypes.factory import ( + boolean, + categorical, + date, + datetime, + decimal, + duration, + floating, + integer, + interval, + list, + map, + period, + sparse, + string, + struct, +) + +from pandas import Series +from pandas.core.api import ( + ArrowDtype, + BooleanDtype, + Float32Dtype, + Float64Dtype, + Int32Dtype, + Int64Dtype, + StringDtype, +) + + +# String +def test_string_default(): + result = string() + assert result == StringDtype() + assert str(result) == "string" + + +def test_string_with_mode(): + result = string(mode="binary", backend="pyarrow") + assert result == ArrowDtype(pa.binary()) + assert str(result) == "binary[pyarrow]" + + +def test_string_invalid_mode(): + with pytest.raises(ValueError, match="mode must be one of"): + string(mode="invalid", backend="pyarrow") + + +# Datetime +def test_datetime_default(): + result = datetime() + 
assert result == np.dtype("datetime64[ns]") + assert isinstance(result, np.dtype) + + +def test_datetime_with_tz(): + result = datetime(tz="UTC") + assert isinstance(result, DatetimeTZDtype) + assert str(result) == "datetime64[ns, UTC]" + + +def test_datetime_pyarrow(): + result = datetime(backend="pyarrow") + assert isinstance(result, ArrowDtype) + assert str(result) == "timestamp[ns][pyarrow]" + + +def test_datetime_invalid_unit(): + with pytest.raises(ValueError, match="unit must be one of"): + datetime(unit="invalid", backend="numpy") + + +# Integer +def test_integer_default(): + result = integer() + assert result == Int64Dtype() + assert str(result) == "Int64" + + +def test_integer_with_bits(): + result = integer(bits=32) + assert result == Int32Dtype() + assert str(result) == "Int32" + + +def test_integer_numpy(): + result = integer(bits=64, backend="numpy") + assert result == np.dtype("int64") + assert str(result) == "int64" + + +def test_integer_pyarrow(): + result = integer(bits=64, backend="pyarrow") + assert isinstance(result, ArrowDtype) + assert str(result) == "int64[pyarrow]" + + +# Floating +def test_floating_default(): + result = floating() + assert result == Float64Dtype() + assert str(result) == "Float64" + + +def test_floating_with_bits(): + result = floating(bits=32) + assert result == Float32Dtype() + assert str(result) == "Float32" + + +def test_floating_numpy(): + result = floating(bits=64, backend="numpy") + assert result == np.dtype("float64") + assert str(result) == "float64" + + +def test_floating_pyarrow(): + result = floating(bits=64, backend="pyarrow") + assert isinstance(result, ArrowDtype) + assert str(result) == "double[pyarrow]" + + +# Decimal +def test_decimal_default(): + result = decimal(precision=38, scale=10) + assert isinstance(result, ArrowDtype) + assert str(result) == "decimal128(38, 10)[pyarrow]" + + +def test_decimal_with_precision_scale(): + result = decimal(precision=10, scale=2) + assert isinstance(result, ArrowDtype) 
+ assert str(result) == "decimal128(10, 2)[pyarrow]" + + +# Boolean +def test_boolean_default(): + result = boolean() + assert result == BooleanDtype() + assert str(result) == "boolean" + + +def test_boolean_pyarrow(): + result = boolean(backend="pyarrow") + assert isinstance(result, ArrowDtype) + assert str(result) == "bool[pyarrow]" + + +# List +def test_list_default(): + result = list() + assert result == np.dtype("object") + assert isinstance(result, np.dtype) + + +def test_list_pyarrow(): + result = list(backend="pyarrow", value_type=pa.int64()) + assert isinstance(result, ArrowDtype) + assert str(result) == "list[pyarrow]" + + +def test_list_large(): + result = list(backend="pyarrow", value_type=pa.string(), large=True) + assert isinstance(result, ArrowDtype) + assert str(result) == "large_list[pyarrow]" + + +# Categorical +def test_categorical_default(): + result = categorical() + assert isinstance(result, CategoricalDtype) + assert str(result) == "category" + + +def test_categorical_pyarrow(): + result = categorical(backend="pyarrow") + assert isinstance(result, ArrowDtype) + assert str(result) == "dictionary[pyarrow]" + + +# Interval +def test_interval_default(): + result = interval() + assert isinstance(result, IntervalDtype) + assert str(result) == "interval" + + +def test_interval_pyarrow(): + result = interval(backend="pyarrow") + assert isinstance(result, ArrowDtype) + assert str(result) == "struct[pyarrow]" + + +# Period +def test_period_default(): + result = period() + assert isinstance(result, PeriodDtype) + assert str(result) == "period[D]" + + +def test_period_pyarrow(): + result = period(backend="pyarrow") + assert isinstance(result, ArrowDtype) + assert str(result) == "month_day_nano_interval[pyarrow]" + + +# Date +def test_date_default(): + result = date() + assert isinstance(result, ArrowDtype) + assert str(result) == "date32[day][pyarrow]" + + +def test_date_pyarrow(): + result = date(backend="pyarrow") + assert isinstance(result, 
ArrowDtype) + assert str(result) == "date32[day][pyarrow]" + + +# Duration +def test_duration_default(): + result = duration() + assert isinstance(result, ArrowDtype) + assert str(result) == "duration[ns][pyarrow]" + + +def test_duration_pyarrow(): + result = duration(backend="pyarrow") + assert isinstance(result, ArrowDtype) + assert str(result) == "duration[ns][pyarrow]" + + +# Map +def test_map_default(): + result = map(index_type=pa.string(), value_type=pa.int64()) + assert isinstance(result, ArrowDtype) + assert str(result) == "map[pyarrow]" + + +def test_map_custom_types(): + result = map(index_type=pa.string(), value_type=pa.float64()) + assert isinstance(result, ArrowDtype) + assert str(result) == "map[pyarrow]" + + +# Struct +def test_struct_default(): + result = struct(fields=[("a", pa.int64()), ("b", pa.string())]) + assert isinstance(result, ArrowDtype) + assert str(result) == "struct[pyarrow]" + + +def test_struct_custom_fields(): + fields = [("x", pa.float32()), ("y", pa.int16())] + result = struct(fields=fields) + assert isinstance(result, ArrowDtype) + assert str(result) == "struct[pyarrow]" + + +# Sparse +def test_sparse_default(): + result = sparse() + assert result == SparseDtype(np.float64, fill_value=np.nan) + assert isinstance(result, SparseDtype) + assert str(result) == "Sparse[float64, nan]" + + +def test_sparse_with_dtype(): + result = sparse(dtype=np.int64) + assert result == SparseDtype(np.int64, fill_value=0) + assert str(result) == "Sparse[int64, 0]" + + +def test_sparse_with_fill_value(): + result = sparse(fill_value=-1) + assert result == SparseDtype(np.float64, fill_value=-1) + assert str(result) == "Sparse[float64, -1]" + + +def test_sparse_backend_invalid(): + with pytest.raises( + ValueError, match="Sparse types are only supported with NumPy backend" + ): + sparse(backend="pyarrow") + + +def test_sparse_series_creation(): + data = [1.0, None, None, 3.0, None] + s_sparse = Series(data, dtype=sparse()) + assert s_sparse.dtype == 
SparseDtype(np.float64, fill_value=np.nan) + assert s_sparse.memory_usage() < Series(data, dtype=np.float64).memory_usage() From 1e8353e21de8655a2bbb4fb23a38c4ebc1bc788c Mon Sep 17 00:00:00 2001 From: Pedro Diogo Date: Sat, 21 Jun 2025 18:45:20 +0100 Subject: [PATCH 2/3] fixed import pyarrow --- pandas/core/dtypes/factory.py | 59 +++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/pandas/core/dtypes/factory.py b/pandas/core/dtypes/factory.py index 824ec722b9460..364f2d84cd59c 100644 --- a/pandas/core/dtypes/factory.py +++ b/pandas/core/dtypes/factory.py @@ -10,7 +10,6 @@ ) import numpy as np -import pyarrow as pa from pandas._libs import missing as libmissing @@ -77,6 +76,8 @@ def string( if mode not in valid_modes: raise ValueError(f"mode must be one of {valid_modes}, got {mode}") if backend == "pyarrow": + import pyarrow as pa + if mode == "string": pa_type = pa.large_string() if large else pa.string() else: # mode == "binary" @@ -128,6 +129,8 @@ def datetime( return DatetimeTZDtype(unit=unit, tz=tz) return np.dtype(f"datetime64[{unit}]") else: # pyarrow + import pyarrow as pa + return ArrowDtype(pa.timestamp(unit, tz=tz)) @@ -167,24 +170,25 @@ def integer( if backend == "numpy": return np.dtype(f"int{bits}") - - if backend == "pandas": + elif backend == "pandas": if bits == 8: return Int8Dtype() elif bits == 16: return Int16Dtype() elif bits == 32: return Int32Dtype() - elif bits == 64: + else: # bits == 64 return Int64Dtype() elif backend == "pyarrow": + import pyarrow as pa + if bits == 8: return ArrowDtype(pa.int8()) elif bits == 16: return ArrowDtype(pa.int16()) elif bits == 32: return ArrowDtype(pa.int32()) - elif bits == 64: + else: # bits == 64 return ArrowDtype(pa.int64()) else: raise ValueError(f"Unsupported backend: {backend!r}") @@ -224,16 +228,17 @@ def floating( if backend == "numpy": return np.dtype(f"float{bits}") - - if backend == "pandas": + elif backend == "pandas": if bits == 32: return 
Float32Dtype() - elif bits == 64: + else: # bits == 64 return Float64Dtype() elif backend == "pyarrow": + import pyarrow as pa + if bits == 32: return ArrowDtype(pa.float32()) - elif bits == 64: + else: # bits == 64 return ArrowDtype(pa.float64()) else: raise ValueError(f"Unsupported backend: {backend!r}") @@ -270,6 +275,8 @@ def decimal( decimal256[40, 5][pyarrow] """ if backend == "pyarrow": + import pyarrow as pa + if precision <= 38: return ArrowDtype(pa.decimal128(precision, scale)) return ArrowDtype(pa.decimal256(precision, scale)) @@ -302,6 +309,8 @@ def boolean( if backend == "numpy": return BooleanDtype() else: # pyarrow + import pyarrow as pa + return ArrowDtype(pa.bool_()) @@ -344,6 +353,8 @@ def list( if backend == "numpy": return np.dtype("object") else: # pyarrow + import pyarrow as pa + if value_type is None: value_type = pa.int64() pa_type = pa.large_list(value_type) if large else pa.list_(value_type) @@ -396,6 +407,8 @@ def categorical( if backend == "numpy": return CategoricalDtype(categories=categories, ordered=ordered) else: # pyarrow + import pyarrow as pa + index_type = pa.int32() if index_type is None else index_type value_type = pa.string() if value_type is None else value_type return ArrowDtype(pa.dictionary(index_type, value_type)) @@ -437,6 +450,8 @@ def interval( if backend == "numpy": return IntervalDtype(subtype=subtype, closed=closed) else: # pyarrow + import pyarrow as pa + if subtype is not None: return ArrowDtype( pa.struct( @@ -491,6 +506,8 @@ def period( if backend == "numpy": return PeriodDtype(freq=freq) else: # pyarrow + import pyarrow as pa + return ArrowDtype(pa.month_day_nano_interval()) @@ -590,6 +607,8 @@ def date( if backend != "pyarrow": raise ValueError("Date types are only supported with PyArrow backend.") + import pyarrow as pa + return ArrowDtype(pa.date32() if unit == "day" else pa.date64()) @@ -629,6 +648,8 @@ def duration( if backend == "numpy": return np.dtype(f"timedelta64[{unit}]") else: # pyarrow + import 
pyarrow as pa + return ArrowDtype(pa.duration(unit)) @@ -677,6 +698,8 @@ def map( """ if backend != "pyarrow": raise ValueError("Map types are only supported with PyArrow backend.") + import pyarrow as pa + return ArrowDtype(pa.map_(index_type, value_type)) @@ -724,14 +747,10 @@ def struct( 1 (2, Bob) dtype: struct[pyarrow] """ - if backend != "pyarrow": - raise ValueError("Struct types are only supported with PyArrow backend.") - # Validate that fields is a list of (str, type) tuples - for field in fields: - if ( - not isinstance(field, tuple) - or len(field) != 2 - or not isinstance(field[0], str) - ): - raise ValueError("Each field must be a tuple of (str, type), got {field}") - return ArrowDtype(pa.struct(fields)) + if backend == "pyarrow": + import pyarrow as pa + + pa_fields = [(name, getattr(typ, "pyarrow_dtype", typ)) for name, typ in fields] + return ArrowDtype(pa.struct(pa_fields)) + else: + raise ValueError(f"Unsupported backend: {backend!r}") From 3dcf97eb0ed8297a8060b32848ea05022077086e Mon Sep 17 00:00:00 2001 From: Pedro Diogo Date: Sat, 21 Jun 2025 18:57:26 +0100 Subject: [PATCH 3/3] fixed pyarrow tests --- pandas/tests/dtypes/test_factory.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_factory.py b/pandas/tests/dtypes/test_factory.py index abf3c5bceeb7d..3f96c8768924f 100644 --- a/pandas/tests/dtypes/test_factory.py +++ b/pandas/tests/dtypes/test_factory.py @@ -1,5 +1,4 @@ import numpy as np -import pyarrow as pa import pytest from pandas.core.dtypes.dtypes import ( @@ -47,6 +46,7 @@ def test_string_default(): def test_string_with_mode(): + pa = pytest.importorskip("pyarrow") result = string(mode="binary", backend="pyarrow") assert result == ArrowDtype(pa.binary()) assert str(result) == "binary[pyarrow]" @@ -165,12 +165,14 @@ def test_list_default(): def test_list_pyarrow(): + pa = pytest.importorskip("pyarrow") result = list(backend="pyarrow", value_type=pa.int64()) assert isinstance(result, 
ArrowDtype) assert str(result) == "list[pyarrow]" def test_list_large(): + pa = pytest.importorskip("pyarrow") result = list(backend="pyarrow", value_type=pa.string(), large=True) assert isinstance(result, ArrowDtype) assert str(result) == "large_list[pyarrow]" @@ -243,12 +245,14 @@ def test_duration_pyarrow(): # Map def test_map_default(): + pa = pytest.importorskip("pyarrow") result = map(index_type=pa.string(), value_type=pa.int64()) assert isinstance(result, ArrowDtype) assert str(result) == "map[pyarrow]" def test_map_custom_types(): + pa = pytest.importorskip("pyarrow") result = map(index_type=pa.string(), value_type=pa.float64()) assert isinstance(result, ArrowDtype) assert str(result) == "map[pyarrow]" @@ -256,12 +260,14 @@ def test_map_custom_types(): # Struct def test_struct_default(): + pa = pytest.importorskip("pyarrow") result = struct(fields=[("a", pa.int64()), ("b", pa.string())]) assert isinstance(result, ArrowDtype) assert str(result) == "struct[pyarrow]" def test_struct_custom_fields(): + pa = pytest.importorskip("pyarrow") fields = [("x", pa.float32()), ("y", pa.int16())] result = struct(fields=fields) assert isinstance(result, ArrowDtype)