From 513a8bf606deda108b04c536ef14c40647283095 Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Sun, 2 Dec 2018 18:19:49 +0000
Subject: [PATCH 1/3] Stop ignoring fixed rst files in flake8-rst

---
 setup.cfg | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index cc7393e5a09b9..2966e8b92ad39 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -69,10 +69,6 @@ exclude =
     doc/source/advanced.rst
     doc/source/basics.rst
     doc/source/categorical.rst
-    doc/source/comparison_with_r.rst
-    doc/source/comparison_with_sql.rst
-    doc/source/comparison_with_stata.rst
-    doc/source/computation.rst
     doc/source/contributing.rst
     doc/source/contributing_docstring.rst
     doc/source/dsintro.rst
@@ -80,14 +76,12 @@ exclude =
     doc/source/extending.rst
     doc/source/groupby.rst
     doc/source/indexing.rst
-    doc/source/io.rst
     doc/source/merging.rst
     doc/source/missing_data.rst
     doc/source/options.rst
     doc/source/release.rst
     doc/source/reshaping.rst
     doc/source/timedeltas.rst
-    doc/source/timeseries.rst
     doc/source/visualization.rst

From 563007bb4a72030764d892a9c41449cc9f5b9d07 Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Sun, 2 Dec 2018 20:26:22 +0000
Subject: [PATCH 2/3] Fixing remaining flake8 issues in io.rst, and removing timedeltas.rst from the flake8-rst exclude list

---
 doc/source/io.rst | 230 ++++++++++++++++++++++------------------------
 setup.cfg         |   1 -
 2 files changed, 111 insertions(+), 120 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 372a7b8a325e7..d996b1b5ce5b3 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1151,7 +1151,7 @@ Let us consider some examples:
 
 .. code-block:: python
 
-   read_csv(path, na_values=[5])
+   pd.read_csv('path_to_file.csv', na_values=[5])
 
 In the example above ``5`` and ``5.0`` will be recognized as ``NaN``, in
 addition to the defaults. A string will first be interpreted as a numerical
@@ -1159,19 +1159,19 @@ addition to the defaults. A string will first be interpreted as a numerical
 
 .. code-block:: python
 
-   read_csv(path, keep_default_na=False, na_values=[""])
+   pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=[""])
 
 Above, only an empty field will be recognized as ``NaN``.
 
 .. code-block:: python
 
-   read_csv(path, keep_default_na=False, na_values=["NA", "0"])
+   pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=["NA", "0"])
 
 Above, both ``NA`` and ``0`` as strings are ``NaN``.
 
 .. code-block:: python
 
-   read_csv(path, na_values=["Nope"])
+   pd.read_csv('path_to_file.csv', na_values=["Nope"])
 
 The default values, in addition to the string ``"Nope"`` are recognized as
 ``NaN``.
@@ -1245,24 +1245,13 @@ too few fields will have NA values filled in the trailing fields. Lines with
 too many fields will raise an error by default:
 
 .. ipython:: python
-   :suppress:
+    :okexcept:
 
    data = ('a,b,c\n'
            '1,2,3\n'
            '4,5,6,7\n'
            '8,9,10')
-
-.. code-block:: ipython
-
-   In [27]: data = ('a,b,c\n'
-                    '1,2,3\n'
-                    '4,5,6,7\n'
-                    '8,9,10')
-
-   In [28]: pd.read_csv(StringIO(data))
-   ---------------------------------------------------------------------------
-   ParserError                               Traceback (most recent call last)
-   ParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 4
+   pd.read_csv(StringIO(data))
 
 You can elect to skip bad lines:
 
@@ -2754,7 +2743,7 @@ file, and the ``sheet_name`` indicating which sheet to parse.
 
 .. code-block:: python
 
    # Returns a DataFrame
-   read_excel('path_to_file.xls', sheet_name='Sheet1')
+   pd.read_excel('path_to_file.xls', sheet_name='Sheet1')
 
 .. 
_io.excel.excelfile_class: @@ -2803,14 +2792,14 @@ of sheet names can simply be passed to ``read_excel`` with no loss in performanc # using the ExcelFile class data = {} with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = read_excel(xls, 'Sheet1', index_col=None, - na_values=['NA']) - data['Sheet2'] = read_excel(xls, 'Sheet2', index_col=None, - na_values=['NA']) + data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, + na_values=['NA']) + data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=None, + na_values=['NA']) # equivalent using the read_excel function - data = read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], - index_col=None, na_values=['NA']) + data = pd.read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], + index_col=None, na_values=['NA']) .. _io.excel.specifying_sheets: @@ -2832,35 +2821,35 @@ Specifying Sheets .. code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) Using the sheet index: .. code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + pd.read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) Using all default values: .. code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls') + pd.read_excel('path_to_file.xls') Using None to get all sheets: .. code-block:: python # Returns a dictionary of DataFrames - read_excel('path_to_file.xls', sheet_name=None) + pd.read_excel('path_to_file.xls', sheet_name=None) Using a list to get multiple sheets: .. code-block:: python # Returns the 1st and 4th sheet, as a dictionary of DataFrames. - read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) + pd.read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) ``read_excel`` can read more than one sheet, by setting ``sheet_name`` to either a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. @@ -2932,20 +2921,20 @@ to be parsed. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=2) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=2) You can also specify a comma-delimited set of Excel columns and ranges as a string: .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols='A,C:E') + pd.read_excel('path_to_file.xls', 'Sheet1', usecols='A,C:E') If ``usecols`` is a list of integers, then it is assumed to be the file column indices to be parsed. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. @@ -2957,7 +2946,7 @@ document header row(s). Those strings define which columns will be parsed: .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``. @@ -2968,7 +2957,7 @@ the column names, returning names where the callable function evaluates to ``Tru .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) Parsing Dates +++++++++++++ @@ -2980,7 +2969,7 @@ use the ``parse_dates`` keyword to parse those strings to datetimes: .. 
code-block:: python
 
-   read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings'])
+   pd.read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings'])
 
 
 Cell Converters
@@ -2991,7 +2980,7 @@ option. For instance, to convert a column to boolean:
 
 .. code-block:: python
 
-   read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool})
+   pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool})
 
 This options handles missing values and treats exceptions in the converters
 as missing data. Transformations are applied cell by cell rather than to the
@@ -3006,7 +2995,7 @@ missing data to recover integer dtype:
 
        return int(x) if x else -1
 
-   read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})
+   pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})
 
 dtype Specifications
 ++++++++++++++++++++
@@ -3020,7 +3009,7 @@ no type inference, use the type ``str`` or ``object``.
 
 .. code-block:: python
 
-   read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str})
+   pd.read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str})
 
 .. _io.excel_writer:
 
@@ -5126,7 +5115,7 @@ If you have an SQLAlchemy description of your database you can express where con
            sa.Column('Col_3', sa.Boolean),
            )
 
-   pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 == True), engine)
+   pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3.is_(True)), engine)
 
 You can combine SQLAlchemy expressions with parameters passed to :func:`read_sql` using :func:`sqlalchemy.bindparam`
 
@@ -5155,7 +5144,7 @@ And then issue the following queries:
 
 .. code-block:: python
 
-   data.to_sql('data', cnx)
+   data.to_sql('data', con)
 
    pd.read_sql_query("SELECT * FROM data", con)
 
@@ -5372,6 +5361,9 @@ Obtain an iterator and read an XPORT file 100,000 lines at a time:
 
 .. code-block:: python
 
+    def do_something(chunk):
+        pass
+
     rdr = pd.read_sas('sas_xport.xpt', chunk=100000)
     for chunk in rdr:
         do_something(chunk)
@@ -5424,85 +5416,7 @@ ignored.
 dtypes: float64(1), int64(1)
 memory usage: 15.3 MB
 
-When writing, the top-three functions in terms of speed are are
-``test_pickle_write``, ``test_feather_write`` and ``test_hdf_fixed_write_compress``.
-
-.. code-block:: ipython
-
-   In [14]: %timeit test_sql_write(df)
-   2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-
-   In [15]: %timeit test_hdf_fixed_write(df)
-   194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
-
-   In [26]: %timeit test_hdf_fixed_write_compress(df)
-   119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
-
-   In [16]: %timeit test_hdf_table_write(df)
-   623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-
-   In [27]: %timeit test_hdf_table_write_compress(df)
-   563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-
-   In [17]: %timeit test_csv_write(df)
-   3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-
-   In [30]: %timeit test_feather_write(df)
-   103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
-
-   In [31]: %timeit test_pickle_write(df)
-   109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
-
-   In [32]: %timeit test_pickle_write_compress(df)
-   3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-
-When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and
-``test_hdf_fixed_read``.
-
-.. code-block:: ipython
-
-   In [18]: %timeit test_sql_read()
-   1.35 s ± 14.7 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)
-
-   In [19]: %timeit test_hdf_fixed_read()
-   14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
-
-   In [28]: %timeit test_hdf_fixed_read_compress()
-   23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
-
-   In [20]: %timeit test_hdf_table_read()
-   35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
-
-   In [29]: %timeit test_hdf_table_read_compress()
-   42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
-
-   In [22]: %timeit test_csv_read()
-   516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-
-   In [33]: %timeit test_feather_read()
-   4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
-
-   In [34]: %timeit test_pickle_read()
-   6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
-
-   In [35]: %timeit test_pickle_read_compress()
-   588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-
-Space on disk (in bytes)
-
-.. code-block:: none
-
-   34816000 Aug 21 18:00 test.sql
-   24009240 Aug 21 18:00 test_fixed.hdf
-    7919610 Aug 21 18:00 test_fixed_compress.hdf
-   24458892 Aug 21 18:00 test_table.hdf
-    8657116 Aug 21 18:00 test_table_compress.hdf
-   28520770 Aug 21 18:00 test.csv
-   16000248 Aug 21 18:00 test.feather
-   16000848 Aug 21 18:00 test.pkl
-    7554108 Aug 21 18:00 test.pkl.compress
-
-And here's the code:
+Given the following test set:
 
 .. code-block:: python
 
@@ -5589,3 +5503,81 @@ And here's the code:
 
     def test_pickle_read_compress():
         pd.read_pickle('test.pkl.compress', compression='xz')
+
+When writing, the top three functions in terms of speed are
+``test_pickle_write``, ``test_feather_write`` and ``test_hdf_fixed_write_compress``.
+
+.. code-block:: ipython
+
+   In [14]: %timeit test_sql_write(df)
+   2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+   In [15]: %timeit test_hdf_fixed_write(df)
+   194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+   In [26]: %timeit test_hdf_fixed_write_compress(df)
+   119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+   In [16]: %timeit test_hdf_table_write(df)
+   623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+   In [27]: %timeit test_hdf_table_write_compress(df)
+   563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+   In [17]: %timeit test_csv_write(df)
+   3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+   In [30]: %timeit test_feather_write(df)
+   103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+   In [31]: %timeit test_pickle_write(df)
+   109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+   In [32]: %timeit test_pickle_write_compress(df)
+   3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and
+``test_hdf_fixed_read``.
+
+.. code-block:: ipython
+
+   In [18]: %timeit test_sql_read()
+   1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+   In [19]: %timeit test_hdf_fixed_read()
+   14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
+
+   In [28]: %timeit test_hdf_fixed_read_compress()
+   23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+   In [20]: %timeit test_hdf_table_read()
+   35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+   In [29]: %timeit test_hdf_table_read_compress()
+   42.6 ms ± 2.1 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each) + + In [22]: %timeit test_csv_read() + 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [33]: %timeit test_feather_read() + 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [34]: %timeit test_pickle_read() + 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [35]: %timeit test_pickle_read_compress() + 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + +Space on disk (in bytes) + +.. code-block:: none + + 34816000 Aug 21 18:00 test.sql + 24009240 Aug 21 18:00 test_fixed.hdf + 7919610 Aug 21 18:00 test_fixed_compress.hdf + 24458892 Aug 21 18:00 test_table.hdf + 8657116 Aug 21 18:00 test_table_compress.hdf + 28520770 Aug 21 18:00 test.csv + 16000248 Aug 21 18:00 test.feather + 16000848 Aug 21 18:00 test.pkl + 7554108 Aug 21 18:00 test.pkl.compress diff --git a/setup.cfg b/setup.cfg index 2966e8b92ad39..7909d3ef7f2d7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -81,7 +81,6 @@ exclude = doc/source/options.rst doc/source/release.rst doc/source/reshaping.rst - doc/source/timedeltas.rst doc/source/visualization.rst From 2f6b576593b0af1089f45c0b4461b533879c4653 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sun, 2 Dec 2018 22:36:32 +0000 Subject: [PATCH 3/3] Fixing typo that generated the wrong exception when parsing a csv --- doc/source/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index d996b1b5ce5b3..fbd238586c776 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1245,7 +1245,7 @@ too few fields will have NA values filled in the trailing fields. Lines with too many fields will raise an error by default: .. ipython:: python - :okexcept: + :okexcept: data = ('a,b,c\n' '1,2,3\n'
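
One flake8 rule touched by these io.rst changes deserves a short illustration: pycodestyle flags ``== True`` comparisons (E712), but the Python ``is`` operator cannot be overloaded, so spelling the SQLAlchemy filter as ``data_table.c.Col_3 is True`` would collapse to a plain boolean before any SQL is generated. ``Column.is_()`` keeps the expression symbolic and keeps the linter quiet. Below is a minimal, self-contained sketch of that form, assuming an in-memory SQLite engine and a throwaway table that mirrors the ``data_table`` defined in io.rst rather than the exact code from the docs:

.. code-block:: python

   import pandas as pd
   import sqlalchemy as sa

   # Throwaway in-memory database; io.rst defines a similar ``data_table``.
   engine = sa.create_engine('sqlite:///:memory:')
   metadata = sa.MetaData()
   data_table = sa.Table('data', metadata,
                         sa.Column('Col_1', sa.String),
                         sa.Column('Col_2', sa.Float),
                         sa.Column('Col_3', sa.Boolean))
   metadata.create_all(engine)

   # ``is_()`` builds a SQL ``IS`` comparison, so the filter still runs in the
   # database; the plain ``is`` operator would evaluate to ``False`` in Python
   # before SQLAlchemy ever saw it.
   query = sa.select([data_table]).where(data_table.c.Col_3.is_(True))
   pd.read_sql(query, engine)

The same pattern works for ``False`` via ``.is_(False)``, which satisfies both flake8 and SQLAlchemy's expression machinery.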