|
5 | 5 | import numpy as np
|
6 | 6 | import warnings
|
7 | 7 |
|
| 8 | +import pandas as pd |
8 | 9 | from pandas._libs import tslib, lib
|
9 | 10 | from pandas._libs.tslib import iNaT
|
10 | 11 | from pandas.compat import string_types, text_type, PY3
|
|
18 | 19 | is_integer_dtype,
|
19 | 20 | is_datetime_or_timedelta_dtype,
|
20 | 21 | is_bool_dtype, is_scalar,
|
| 22 | + is_numeric_dtype, is_decimal, |
| 23 | + is_number, |
21 | 24 | _string_dtypes,
|
22 | 25 | _coerce_to_dtype,
|
23 | 26 | _ensure_int8, _ensure_int16,
|
24 | 27 | _ensure_int32, _ensure_int64,
|
25 | 28 | _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE,
|
26 | 29 | _POSSIBLY_CAST_DTYPES)
|
27 | 30 | from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype
|
28 |
| -from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries |
| 31 | +from .generic import (ABCDatetimeIndex, ABCPeriodIndex, |
| 32 | + ABCSeries, ABCIndexClass) |
29 | 33 | from .missing import isnull, notnull
|
30 | 34 | from .inference import is_list_like
|
31 | 35 |
|
@@ -1025,3 +1029,161 @@ def find_common_type(types):
|
1025 | 1029 | return np.object
|
1026 | 1030 |
|
1027 | 1031 | return np.find_common_type(types, [])
|
| 1032 | + |
| 1033 | + |
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, Index, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, only candidate dtypes whose size does not exceed
        the size of the resulting data's dtype are tried, so if none of
        the dtypes checked satisfy that specification, no downcasting
        will be performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input. Series if Series, Index if Index,
        scalar if scalar, otherwise ndarray

    Raises
    ------
    ValueError
        If ``downcast`` or ``errors`` is not one of the accepted values.
    TypeError
        If ``arg`` has more than one dimension.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    # Reject unknown values up front; otherwise anything that is not
    # 'ignore' or 'raise' (e.g. a typo) would silently behave as 'coerce'.
    if errors not in ('ignore', 'raise', 'coerce'):
        raise ValueError('invalid error value specified')

    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndexClass):
        is_index = True
        # prefer the int64 representation when the index provides one
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif is_scalar(arg):
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            return arg
        # wrap the scalar so it goes through the array code path;
        # it is unwrapped again before returning
        is_scalars = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if is_numeric_dtype(values):
            pass
        elif is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = _ensure_object(values)
            coerce_numeric = errors == 'coerce'
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)

    except Exception:
        # with errors='ignore' a failed conversion is deliberately
        # swallowed and the unconverted values are returned below
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned':
            # np.min raises on zero-size input, so treat empty data as
            # trivially non-negative instead of evaluating the minimum
            if np.size(values) == 0 or np.min(values) >= 0:
                typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return pd.Index(values, name=arg.name)
    elif is_scalars:
        return values[0]
    else:
        return values
0 commit comments