From 15392d7e3aa5c9d4d26a008a6dbc09b6bce54e5c Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Wed, 30 Dec 2020 15:43:56 +0100 Subject: [PATCH 1/2] CLN: Add typing for dtype argument in io directory (GH38808) --- pandas/io/excel/_base.py | 6 +++--- pandas/io/json/_json.py | 8 +++++--- pandas/io/parsers.py | 13 ++++++++----- pandas/io/pytables.py | 11 +++++++++-- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 221e8b9ccfb14..d54426a437843 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -12,7 +12,7 @@ from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES -from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions +from pandas._typing import Buffer, DtypeArg, FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc @@ -309,7 +309,7 @@ def read_excel( index_col=None, usecols=None, squeeze=False, - dtype=None, + dtype: Optional[DtypeArg] = None, engine=None, converters=None, true_values=None, @@ -433,7 +433,7 @@ def parse( index_col=None, usecols=None, squeeze=False, - dtype=None, + dtype: Optional[DtypeArg] = None, true_values=None, false_values=None, skiprows=None, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e1ac7b1b02f21..dd1c012252683 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -11,6 +11,8 @@ from pandas._libs.tslibs import iNaT from pandas._typing import ( CompressionOptions, + DtypeArg, + FrameOrSeriesUnion, IndexLabel, JSONSerializable, StorageOptions, @@ -296,7 +298,7 @@ def read_json( path_or_buf=None, orient=None, typ="frame", - dtype=None, + dtype: Optional[DtypeArg] = None, convert_axes=None, convert_dates=True, keep_default_dates: bool = True, @@ -775,7 +777,7 @@ def __init__( self, json, orient, - dtype=None, + dtype: Optional[DtypeArg] = None, convert_axes=True, convert_dates=True, keep_default_dates=False, @@ -809,7 +811,7 @@ def __init__( self.convert_dates = convert_dates self.date_unit = date_unit self.keep_default_dates = keep_default_dates - self.obj = None + self.obj: Optional[FrameOrSeriesUnion] = None def check_keys_split(self, decoded): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d670821c98520..be228bf9552a5 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -31,7 +31,7 @@ import pandas._libs.parsers as parsers from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing -from pandas._typing import FilePathOrBuffer, StorageOptions, Union +from pandas._typing import DtypeArg, FilePathOrBuffer, StorageOptions, Union from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -546,7 +546,7 @@ def read_csv( prefix=None, mangle_dupe_cols=True, # General Parsing Configuration - dtype=None, + dtype: Optional[DtypeArg] = None, engine=None, converters=None, true_values=None, @@ -626,7 +626,7 @@ def read_table( prefix=None, mangle_dupe_cols=True, # General Parsing Configuration - dtype=None, + dtype: Optional[DtypeArg] = None, engine=None, converters=None, true_values=None, @@ -3502,20 +3502,23 @@ def _clean_index_names(columns, index_col, unnamed_cols): return index_names, columns, index_col -def _get_empty_meta(columns, index_col, index_names, dtype=None): +def _get_empty_meta(columns, index_col, index_names, dtype: Optional[DtypeArg] = None): columns = list(columns) # Convert `dtype` to a defaultdict of some kind. # This will enable us to write `dtype[col_name]` # without worrying about KeyError issues later on. - if not isinstance(dtype, dict): + if not is_dict_like(dtype): # if dtype == None, default will be object. default_dtype = dtype or object dtype = defaultdict(lambda: default_dtype) else: # Save a copy of the dictionary. + dtype = cast(dict, dtype) _dtype = dtype.copy() + dtype = defaultdict(lambda: object) + dtype = cast(dict, dtype) # Convert column indexes to column names. for k, v in _dtype.items(): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d5c5e8edb9efe..341a8a9f90b96 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -29,7 +29,14 @@ from pandas._libs import lib, writers as libwriters from pandas._libs.tslibs import timezones -from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion, Label, Shape +from pandas._typing import ( + ArrayLike, + DtypeArg, + FrameOrSeries, + FrameOrSeriesUnion, + Label, + Shape, +) from pandas.compat._optional import import_optional_dependency from pandas.compat.pickle_compat import patch_pickle from pandas.errors import PerformanceWarning @@ -2259,7 +2266,7 @@ def __init__( table=None, meta=None, metadata=None, - dtype=None, + dtype: Optional[DtypeArg] = None, data=None, ): super().__init__( From fc77159948183cccb6ecb7a1d806f7d76b03a945 Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Wed, 30 Dec 2020 17:40:28 +0100 Subject: [PATCH 2/2] Update code logic --- pandas/io/parsers.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index be228bf9552a5..68c0bbf0787e6 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3513,17 +3513,11 @@ def _get_empty_meta(columns, index_col, index_names, dtype: Optional[DtypeArg] = default_dtype = dtype or object dtype = defaultdict(lambda: default_dtype) else: - # Save a copy of the dictionary. dtype = cast(dict, dtype) - _dtype = dtype.copy() - - dtype = defaultdict(lambda: object) - dtype = cast(dict, dtype) - - # Convert column indexes to column names. - for k, v in _dtype.items(): - col = columns[k] if is_integer(k) else k - dtype[col] = v + dtype = defaultdict( + lambda: object, + {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, + ) # Even though we have no data, the "index" of the empty DataFrame # could for example still be an empty MultiIndex. Thus, we need to