pandas-dev · AlexHodgson · Oct 9, 2023 · Oct 10, 2023 · Oct 11, 2023 · Oct 11, 2023
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -14,7 +14,24 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_300.enhancements.enhancement1:
+.. _whatsnew_300.enhancements.to_numeric_separators:
+
+Custom Thousand and Decimal Separators added to :func:`to_numeric`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Added two new parameters, ``decimal`` and ``thousands``, to :func:`to_numeric`, allowing users to specify a custom decimal point and thousands separator (:issue:`4674`).
+``decimal`` defaults to ``'.'`` and ``thousands`` defaults to ``None``, meaning that strings are expected to contain no thousands separators and to use ``.`` as the decimal point. This default behaviour is the same as that of :func:`to_numeric` before the parameters were added.
+:func:`to_numeric` can now parse number strings such as ``'1,000,000'`` or ``'1.000,5'`` when the matching separators are passed.
+
+An example of the new functionality:
+
+.. code-block:: python
+
+    >>> s = pd.Series(['1,5', '2.000.000', -3])
+    >>> pd.to_numeric(s, thousands='.', decimal=',')
+    0          1.5
+    1    2000000.0
+    2         -3.0
+    dtype: float64
 
 enhancement1
 ^^^^^^^^^^^^

diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h
@@ -17,8 +17,8 @@ extern "C" {
 #include <Python.h>
 
 typedef struct {
-  int (*to_double)(char *, double *, char, char, int *);
-  int (*floatify)(PyObject *, double *, int *);
+  int (*to_double)(char *, double *, char, char, char, int *);
+  int (*floatify)(PyObject *, double *, int *, char, char);
   void *(*new_rd_source)(PyObject *);
   void (*del_rd_source)(void *);
   char *(*buffer_rd_bytes)(void *, size_t, size_t *, int *, const char *);
@@ -58,8 +58,10 @@ static PandasParser_CAPI *PandasParserAPI = NULL;
 
-#define to_double(item, p_value, sci, decimal, maybe_int)                      \
-  PandasParserAPI->to_double((item), (p_value), (sci), (decimal), (maybe_int))
-#define floatify(str, result, maybe_int)                                       \
-  PandasParserAPI->floatify((str), (result), (maybe_int))
+/* Arguments are forwarded in the order of the PandasParser_CAPI pointers. */
+#define to_double(item, p_value, sci, decimal, tsep, maybe_int)                \
+  PandasParserAPI->to_double((item), (p_value), (sci), (decimal), (tsep),      \
+                             (maybe_int))
+#define floatify(str, result, maybe_int, dec, tsep)                            \
+  PandasParserAPI->floatify((str), (result), (maybe_int), (dec), (tsep))
 #define new_rd_source(obj) PandasParserAPI->new_rd_source((obj))
 #define del_rd_source(src) PandasParserAPI->del_rd_source((src))
 #define buffer_rd_bytes(source, nbytes, bytes_read, status, encoding_errors)   \

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -124,6 +124,8 @@ def maybe_convert_numeric(
     convert_empty: bool = ...,
     coerce_numeric: bool = ...,
     convert_to_masked_nullable: Literal[False] = ...,
+    thousands: str | None = ...,
+    decimal: str = ...,
 ) -> tuple[np.ndarray, None]: ...
 @overload
 def maybe_convert_numeric(
@@ -133,6 +135,8 @@ def maybe_convert_numeric(
     coerce_numeric: bool = ...,
     *,
     convert_to_masked_nullable: Literal[True],
+    thousands: str | None = ...,
+    decimal: str = ...,
 ) -> tuple[np.ndarray, np.ndarray]: ...
 
 # TODO: restrict `arr`?

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -69,7 +69,8 @@ from pandas._libs.interval import Interval
 
 
 cdef extern from "pandas/parser/pd_parser.h":
-    int floatify(object, float64_t *result, int *maybe_int) except -1
+    int floatify(object, float64_t *result, int *maybe_int,
+                 char dec, char tsep) except -1
     void PandasParser_IMPORT()
 
 PandasParser_IMPORT
@@ -2204,6 +2205,8 @@ def maybe_convert_numeric(
     bint convert_empty=True,
     bint coerce_numeric=False,
     bint convert_to_masked_nullable=False,
+    str thousands=None,
+    str decimal="."
 ) -> tuple[np.ndarray, np.ndarray | None]:
     """
     Convert object array to a numeric array if possible.
@@ -2231,6 +2234,14 @@ def maybe_convert_numeric(
     convert_to_masked_nullable : bool, default False
         Whether to return a mask for the converted values. This also disables
         upcasting for ints with nulls to float64.
+    thousands : str or None, default None
+        Character used to separate groups of thousands for readability,
+        e.g. ',' in '1,000,000'. ``None`` means no separator is expected.
+        Must be a single character.
+    decimal : str, default '.'
+        Character used to separate the decimal section from the integer
+        section of the number, e.g. '.' in '12.45'.
+        Must be a single character.
     Returns
     -------
     np.ndarray
@@ -2247,6 +2258,28 @@ def maybe_convert_numeric(
     cdef:
         object val = values[0]
 
+    # Convert python strings into ones readable by C
+
+    cdef char* tsep
+    cdef char* dsep
+    # Use null char to represent lack of separator
+    if thousands is None:
+        tsep = "\0"
+    else:
+        bytes_tsep = thousands.encode("UTF-8")
+        tsep = bytes_tsep
+
+    bytes_dsep = decimal.encode("UTF-8")
+    dsep = bytes_dsep
+
+    # Validate separators; lengths are those of the UTF-8 encoded bytes
+    if len(tsep) > 1:
+        raise ValueError("Thousands separator must not exceed length 1")
+    if len(dsep) != 1:
+        raise ValueError("Decimal separator must have length 1")
+    if tsep[0] == dsep[0]:
+        raise ValueError("Decimal and thousand separators must not be the same")
+
     if util.is_integer_object(val):
         try:
             maybe_ints = values.astype("i8")
@@ -2354,8 +2387,7 @@ def maybe_convert_numeric(
             seen.float_ = True
         else:
             try:
-                floatify(val, &fval, &maybe_int)
-
+                floatify(val, &fval, &maybe_int, dsep[0], tsep[0])
                 if fval in na_values:
                     seen.saw_null()
                     floats[i] = complexes[i] = NaN
@@ -2368,7 +2400,10 @@ def maybe_convert_numeric(
                     floats[i] = fval
 
                 if maybe_int:
-                    as_int = int(val)
+                    if thousands is None:
+                        as_int = int(val)
+                    else:
+                        as_int = int(val.replace(thousands, ""))
 
                     if as_int in na_values:
                         mask[i] = 1

diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c
@@ -13,22 +13,22 @@ Distributed under the terms of the BSD Simplified License.
 #include "pandas/portable.h"
 
 static int to_double(char *item, double *p_value, char sci, char decimal,
-                     int *maybe_int) {
+                     char tsep, int *maybe_int) {
   char *p_end = NULL;
   int error = 0;
 
   /* Switch to precise xstrtod GH 31364 */
   *p_value =
-      precise_xstrtod(item, &p_end, decimal, sci, '\0', 1, &error, maybe_int);
+      precise_xstrtod(item, &p_end, decimal, sci, tsep, 1, &error, maybe_int);
 
   return (error == 0) && (!*p_end);
 }
 
-static int floatify(PyObject *str, double *result, int *maybe_int) {
+static int floatify(PyObject *str, double *result, int *maybe_int, char dec,
+                    char tsep) {
   char *data;
   PyObject *tmp = NULL;
   const char sci = 'E';
-  const char dec = '.';
 
   if (PyBytes_Check(str)) {
     data = PyBytes_AS_STRING(str);
@@ -43,7 +43,7 @@ static int floatify(PyObject *str, double *result, int *maybe_int) {
     return -1;
   }
 
-  const int status = to_double(data, result, sci, dec, maybe_int);
+  const int status = to_double(data, result, sci, dec, tsep, maybe_int);
 
   if (!status) {
     /* handle inf/-inf infinity/-infinity */

diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -44,6 +44,8 @@ def to_numeric(
     errors: DateTimeErrorChoices = "raise",
     downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    thousands: str | None = None,
+    decimal: str = ".",
 ):
     """
     Convert argument to a numeric type.
@@ -99,6 +101,20 @@ def to_numeric(
 
         .. versionadded:: 2.0
 
+    thousands : str or None, default None
+        Character used to separate groups of thousands for readability,
+        e.g. ',' in '1,000,000'.
+        Must be a single character.
+
+        .. versionadded:: 3.0.0
+
+    decimal : str, default '.'
+        Character used to separate the decimal section from the integer
+        section of the number, e.g. '.' in '12.45'.
+        Must be a single character.
+
+        .. versionadded:: 3.0.0
+
     Returns
     -------
     ret
@@ -155,6 +171,15 @@ def to_numeric(
     1    2.1
     2    3.0
     dtype: Float32
+
+    Handling of data with non-standard decimal or thousands separators:
+
+    >>> s = pd.Series(["1,5", "2.000.000", -3])
+    >>> pd.to_numeric(s, thousands=".", decimal=",")
+    0          1.5
+    1    2000000.0
+    2         -3.0
+    dtype: float64
     """
     if downcast not in (None, "integer", "signed", "unsigned", "float"):
         raise ValueError("invalid downcasting method provided")
@@ -203,6 +228,7 @@ def to_numeric(
         mask = values.isna()
         values = values.dropna().to_numpy()
     new_mask: np.ndarray | None = None
+
     if is_numeric_dtype(values_dtype):
         pass
     elif lib.is_np_dtype(values_dtype, "mM"):
@@ -217,6 +243,8 @@ def to_numeric(
             convert_to_masked_nullable=dtype_backend is not lib.no_default
             or isinstance(values_dtype, StringDtype)
             and not values_dtype.storage == "pyarrow_numpy",
+            thousands=thousands,
+            decimal=decimal,
         )
 
     if new_mask is not None:

diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
@@ -904,3 +904,49 @@ def test_coerce_pyarrow_backend():
     result = to_numeric(ser, errors="coerce", dtype_backend="pyarrow")
     expected = Series([1, 2, None], dtype=ArrowDtype(pa.int64()))
     tm.assert_series_equal(result, expected)
+
+
+def test_custom_decimals():
+    # GH 4674: "," is treated as the decimal point
+    values = Series(["1,5", "20,005", -3])
+    expected = Series([1.5, 20.005, -3])
+    converted = to_numeric(values, decimal=",")
+    tm.assert_series_equal(converted, expected)
+
+
+def test_custom_thousands():
+    # GH 4674: "," is treated as the thousands separator
+    values = Series(["1,001", "2,000,000", -3])
+    expected = Series([1001, 2000000, -3])
+    converted = to_numeric(values, thousands=",")
+    tm.assert_series_equal(converted, expected)
+
+
+def test_custom_thousands_and_decimals():
+    # GH 4674: "." groups thousands while "," is the decimal point
+    values = Series(["1.000,0", "2.000.000,5", "2,5", "3"])
+    expected = Series([1000.0, 2000000.5, 2.5, 3])
+    converted = to_numeric(values, thousands=".", decimal=",")
+    tm.assert_series_equal(converted, expected)
+
+
+def test_separator_validation():
+    # GH 4674: invalid separator arguments are rejected with ValueError
+    data = Series(["1", "2", "3"])
+    same_msg = "Decimal and thousand separators must not be the same"
+    thousands_msg = "Thousands separator must not exceed length 1"
+    decimal_msg = "Decimal separator must have length 1"
+
+    # thousands collides with the default decimal separator "."
+    with pytest.raises(ValueError, match=same_msg):
+        to_numeric(data, thousands=".")
+
+    # both separators explicitly set to the same character
+    with pytest.raises(ValueError, match=same_msg):
+        to_numeric(data, thousands=",", decimal=",")
+
+    # separators longer than a single character
+    with pytest.raises(ValueError, match=thousands_msg):
+        to_numeric(data, thousands="test")
+    with pytest.raises(ValueError, match=decimal_msg):
+        to_numeric(data, decimal="test")