diff --git a/pandas/tests/io/parser/multithread.py b/pandas/tests/io/parser/multithread.py deleted file mode 100644 index 0be3a429f5f64..0000000000000 --- a/pandas/tests/io/parser/multithread.py +++ /dev/null @@ -1,101 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests multithreading behaviour for reading and -parsing files for each parser defined in parsers.py -""" - -from __future__ import division - -from multiprocessing.pool import ThreadPool - -import numpy as np - -from pandas.compat import BytesIO, range - -import pandas as pd -from pandas import DataFrame -import pandas.util.testing as tm - - -def _construct_dataframe(num_rows): - - df = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde')) - df['foo'] = 'foo' - df['bar'] = 'bar' - df['baz'] = 'baz' - df['date'] = pd.date_range('20000101 09:00:00', - periods=num_rows, - freq='s') - df['int'] = np.arange(num_rows, dtype='int64') - return df - - -class MultithreadTests(object): - - def _generate_multithread_dataframe(self, path, num_rows, num_tasks): - - def reader(arg): - start, nrows = arg - - if not start: - return self.read_csv(path, index_col=0, header=0, - nrows=nrows, parse_dates=['date']) - - return self.read_csv(path, - index_col=0, - header=None, - skiprows=int(start) + 1, - nrows=nrows, - parse_dates=[9]) - - tasks = [ - (num_rows * i // num_tasks, - num_rows // num_tasks) for i in range(num_tasks) - ] - - pool = ThreadPool(processes=num_tasks) - - results = pool.map(reader, tasks) - - header = results[0].columns - for r in results[1:]: - r.columns = header - - final_dataframe = pd.concat(results) - - return final_dataframe - - def test_multithread_stringio_read_csv(self): - # see gh-11786 - max_row_range = 10000 - num_files = 100 - - bytes_to_df = [ - '\n'.join( - ['%d,%d,%d' % (i, i, i) for i in range(max_row_range)] - ).encode() for j in range(num_files)] - files = [BytesIO(b) for b in bytes_to_df] - - # read all files in many threads - pool = ThreadPool(8) - results = pool.map(self.read_csv, files) - first_result = results[0] - - for result in results: - tm.assert_frame_equal(first_result, result) - - def test_multithread_path_multipart_read_csv(self): - # see gh-11786 - num_tasks = 4 - file_name = '__threadpool_reader__.csv' - num_rows = 100000 - - df = _construct_dataframe(num_rows) - - with tm.ensure_clean(file_name) as path: - df.to_csv(path) - - final_dataframe = self._generate_multithread_dataframe( - path, num_rows, num_tasks) - tm.assert_frame_equal(df, final_dataframe) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py deleted file mode 100644 index 5e67b62879acb..0000000000000 --- a/pandas/tests/io/parser/na_values.py +++ /dev/null @@ -1,399 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that NA values are properly handled during -parsing for all of the parsers defined in parsers.py -""" - -import numpy as np -from numpy import nan -import pytest - -from pandas.compat import StringIO, range - -from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm - -import pandas.io.common as com - - -class NAvaluesTests(object): - - def test_string_nas(self): - data = """A,B,C -a,b,c -d,,f -,g,h -""" - result = self.read_csv(StringIO(data)) - expected = DataFrame([['a', 'b', 'c'], - ['d', np.nan, 'f'], - [np.nan, 'g', 'h']], - columns=['A', 'B', 'C']) - - tm.assert_frame_equal(result, expected) - - def test_detect_string_na(self): - data = """A,B -foo,bar -NA,baz -NaN,nan -""" - expected = np.array([['foo', 'bar'], [nan, 'baz'], [nan, nan]], - 
dtype=np.object_) - df = self.read_csv(StringIO(data)) - tm.assert_numpy_array_equal(df.values, expected) - - def test_non_string_na_values(self): - # see gh-3611: with an odd float format, we can't match - # the string '999.0' exactly but still need float matching - nice = """A,B --999,1.2 -2,-999 -3,4.5 -""" - ugly = """A,B --999,1.200 -2,-999.000 -3,4.500 -""" - na_values_param = [['-999.0', '-999'], - [-999, -999.0], - [-999.0, -999], - ['-999.0'], ['-999'], - [-999.0], [-999]] - expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], - [3.0, 4.5]], columns=['A', 'B']) - - for data in (nice, ugly): - for na_values in na_values_param: - out = self.read_csv(StringIO(data), na_values=na_values) - tm.assert_frame_equal(out, expected) - - def test_default_na_values(self): - _NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A', - 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', 'nan', - '-NaN', '-nan', '#N/A N/A', ''} - assert _NA_VALUES == com._NA_VALUES - nv = len(_NA_VALUES) - - def f(i, v): - if i == 0: - buf = '' - elif i > 0: - buf = ''.join([','] * i) - - buf = "{0}{1}".format(buf, v) - - if i < nv - 1: - buf = "{0}{1}".format(buf, ''.join([','] * (nv - i - 1))) - - return buf - - data = StringIO('\n'.join(f(i, v) for i, v in enumerate(_NA_VALUES))) - expected = DataFrame(np.nan, columns=range(nv), index=range(nv)) - df = self.read_csv(data, header=None) - tm.assert_frame_equal(df, expected) - - def test_custom_na_values(self): - data = """A,B,C -ignore,this,row -1,NA,3 --1.#IND,5,baz -7,8,NaN -""" - expected = np.array([[1., nan, 3], - [nan, 5, nan], - [7, 8, nan]]) - - df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) - tm.assert_numpy_array_equal(df.values, expected) - - df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'], - skiprows=[1]) - tm.assert_numpy_array_equal(df2.values, expected) - - df3 = self.read_table(StringIO(data), sep=',', na_values='baz', - skiprows=[1]) - tm.assert_numpy_array_equal(df3.values, expected) - - def test_bool_na_values(self): - data = """A,B,C -True,False,True -NA,True,False -False,NA,True""" - - result = self.read_csv(StringIO(data)) - expected = DataFrame({'A': np.array([True, nan, False], dtype=object), - 'B': np.array([False, True, nan], dtype=object), - 'C': [True, False, True]}) - - tm.assert_frame_equal(result, expected) - - def test_na_value_dict(self): - data = """A,B,C -foo,bar,NA -bar,foo,foo -foo,bar,NA -bar,foo,foo""" - - df = self.read_csv(StringIO(data), - na_values={'A': ['foo'], 'B': ['bar']}) - expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'], - 'B': [np.nan, 'foo', np.nan, 'foo'], - 'C': [np.nan, 'foo', np.nan, 'foo']}) - tm.assert_frame_equal(df, expected) - - data = """\ -a,b,c,d -0,NA,1,5 -""" - xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0]) - xp.index.name = 'a' - df = self.read_csv(StringIO(data), na_values={}, index_col=0) - tm.assert_frame_equal(df, xp) - - xp = DataFrame({'b': [np.nan], 'd': [5]}, - MultiIndex.from_tuples([(0, 1)])) - xp.index.names = ['a', 'c'] - df = self.read_csv(StringIO(data), na_values={}, index_col=[0, 2]) - tm.assert_frame_equal(df, xp) - - xp = DataFrame({'b': [np.nan], 'd': [5]}, - MultiIndex.from_tuples([(0, 1)])) - xp.index.names = ['a', 'c'] - df = self.read_csv(StringIO(data), na_values={}, index_col=['a', 'c']) - tm.assert_frame_equal(df, xp) - - def test_na_values_keep_default(self): - data = """\ -One,Two,Three -a,1,one -b,2,two -,3,three -d,4,nan -e,5,five -nan,6, -g,7,seven -""" - df = self.read_csv(StringIO(data)) - xp = 
DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['one', 'two', 'three', np.nan, 'five', - np.nan, 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []}, - keep_default_na=False) - xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['one', 'two', 'three', 'nan', 'five', - '', 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - df = self.read_csv( - StringIO(data), na_values=['a'], keep_default_na=False) - xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['one', 'two', 'three', 'nan', 'five', '', - 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []}) - xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['one', 'two', 'three', np.nan, 'five', - np.nan, 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - # see gh-4318: passing na_values=None and - # keep_default_na=False yields 'None' as a na_value - data = """\ -One,Two,Three -a,1,None -b,2,two -,3,None -d,4,nan -e,5,five -nan,6, -g,7,seven -""" - df = self.read_csv( - StringIO(data), keep_default_na=False) - xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['None', 'two', 'None', 'nan', 'five', '', - 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - def test_no_keep_default_na_dict_na_values(self): - # see gh-19227 - data = "a,b\n,2" - - df = self.read_csv(StringIO(data), na_values={"b": ["2"]}, - keep_default_na=False) - expected = DataFrame({"a": [""], "b": [np.nan]}) - tm.assert_frame_equal(df, expected) - - # Scalar values shouldn't cause the parsing to crash or fail. 
- data = "a,b\n1,2" - - df = self.read_csv(StringIO(data), na_values={"b": 2}, - keep_default_na=False) - expected = DataFrame({"a": [1], "b": [np.nan]}) - tm.assert_frame_equal(df, expected) - - data = """\ -113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008 -729639,"qwer","",asdfkj,466.681,,252.373 -""" - expected = DataFrame({0: [np.nan, 729639.0], - 1: [np.nan, "qwer"], - 2: ["/blaha", np.nan], - 3: ["kjsdkj", "asdfkj"], - 4: [412.166, 466.681], - 5: ["225.874", ""], - 6: [np.nan, 252.373]}) - - df = self.read_csv(StringIO(data), header=None, keep_default_na=False, - na_values={2: "", 6: "214.008", - 1: "blah", 0: 113125}) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(data), header=None, keep_default_na=False, - na_values={2: "", 6: "214.008", - 1: "blah", 0: "113125"}) - tm.assert_frame_equal(df, expected) - - def test_na_values_na_filter_override(self): - data = """\ -A,B -1,A -nan,B -3,C -""" - - expected = DataFrame([[1, 'A'], [np.nan, np.nan], [3, 'C']], - columns=['A', 'B']) - out = self.read_csv(StringIO(data), na_values=['B'], na_filter=True) - tm.assert_frame_equal(out, expected) - - expected = DataFrame([['1', 'A'], ['nan', 'B'], ['3', 'C']], - columns=['A', 'B']) - out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False) - tm.assert_frame_equal(out, expected) - - def test_na_trailing_columns(self): - data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax -2012-03-14,USD,AAPL,BUY,1000 -2012-05-12,USD,SBUX,SELL,500""" - - result = self.read_csv(StringIO(data)) - assert result['Date'][1] == '2012-05-12' - assert result['UnitPrice'].isna().all() - - def test_na_values_scalar(self): - # see gh-12224 - names = ['a', 'b'] - data = '1,2\n2,1' - - expected = DataFrame([[np.nan, 2.0], [2.0, np.nan]], - columns=names) - out = self.read_csv(StringIO(data), names=names, na_values=1) - tm.assert_frame_equal(out, expected) - - expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], - columns=names) - out = self.read_csv(StringIO(data), names=names, - na_values={'a': 2, 'b': 1}) - tm.assert_frame_equal(out, expected) - - def test_na_values_dict_aliasing(self): - na_values = {'a': 2, 'b': 1} - na_values_copy = na_values.copy() - - names = ['a', 'b'] - data = '1,2\n2,1' - - expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names) - out = self.read_csv(StringIO(data), names=names, na_values=na_values) - - tm.assert_frame_equal(out, expected) - tm.assert_dict_equal(na_values, na_values_copy) - - def test_na_values_dict_col_index(self): - # see gh-14203 - - data = 'a\nfoo\n1' - na_values = {0: 'foo'} - - out = self.read_csv(StringIO(data), na_values=na_values) - expected = DataFrame({'a': [np.nan, 1]}) - tm.assert_frame_equal(out, expected) - - def test_na_values_uint64(self): - # see gh-14983 - - na_values = [2**63] - data = str(2**63) + '\n' + str(2**63 + 1) - expected = DataFrame([str(2**63), str(2**63 + 1)]) - out = self.read_csv(StringIO(data), header=None, na_values=na_values) - tm.assert_frame_equal(out, expected) - - data = str(2**63) + ',1' + '\n,2' - expected = DataFrame([[str(2**63), 1], ['', 2]]) - out = self.read_csv(StringIO(data), header=None) - tm.assert_frame_equal(out, expected) - - def test_empty_na_values_no_default_with_index(self): - # see gh-15835 - data = "a,1\nb,2" - - expected = DataFrame({'1': [2]}, index=Index(["b"], name="a")) - out = self.read_csv(StringIO(data), keep_default_na=False, index_col=0) - - tm.assert_frame_equal(out, expected) - - def test_no_na_filter_on_index(self): - # see gh-5239 - data = 
"a,b,c\n1,,3\n4,5,6" - - # Don't parse NA-values in index when na_filter=False. - out = self.read_csv(StringIO(data), index_col=[1], na_filter=False) - - expected = DataFrame({"a": [1, 4], "c": [3, 6]}, - index=Index(["", "5"], name="b")) - tm.assert_frame_equal(out, expected) - - # Parse NA-values in index when na_filter=True. - out = self.read_csv(StringIO(data), index_col=[1], na_filter=True) - - expected = DataFrame({"a": [1, 4], "c": [3, 6]}, - index=Index([np.nan, 5.0], name="b")) - tm.assert_frame_equal(out, expected) - - def test_inf_na_values_with_int_index(self): - # see gh-17128 - data = "idx,col1,col2\n1,3,4\n2,inf,-inf" - - # Don't fail with OverflowError with infs and integer index column - out = self.read_csv(StringIO(data), index_col=[0], - na_values=['inf', '-inf']) - expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]}, - index=Index([1, 2], name="idx")) - tm.assert_frame_equal(out, expected) - - @pytest.mark.parametrize("na_filter", [True, False]) - def test_na_values_with_dtype_str_and_na_filter(self, na_filter): - # see gh-20377 - data = "a,b,c\n1,,3\n4,5,6" - - # na_filter=True --> missing value becomes NaN. - # na_filter=False --> missing value remains empty string. - empty = np.nan if na_filter else "" - expected = DataFrame({"a": ["1", "4"], - "b": [empty, "5"], - "c": ["3", "6"]}) - - result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py deleted file mode 100644 index 751fb01e32a6a..0000000000000 --- a/pandas/tests/io/parser/parse_dates.py +++ /dev/null @@ -1,683 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests date parsing functionality for all of the -parsers defined in parsers.py -""" - -from datetime import date, datetime -from distutils.version import LooseVersion - -import numpy as np -import pytest -import pytz - -from pandas._libs.tslib import Timestamp -from pandas._libs.tslibs import parsing -import pandas.compat as compat -from pandas.compat import StringIO, lrange, parse_date -from pandas.compat.numpy import np_array_datetime64_compat - -import pandas as pd -from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series -from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm - -import pandas.io.date_converters as conv -import pandas.io.parsers as parsers - - -class ParseDatesTests(object): - - def test_separator_date_conflict(self): - # Regression test for gh-4678: make sure thousands separator and - # date parsing do not conflict. 
- data = '06-02-2013;13:00;1-000.215' - expected = DataFrame( - [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], - columns=['Date', 2] - ) - - df = self.read_csv(StringIO(data), sep=';', thousands='-', - parse_dates={'Date': [0, 1]}, header=None) - tm.assert_frame_equal(df, expected) - - def test_multiple_date_col(self): - # Can use multiple date parsers - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - def func(*date_cols): - res = parsing.try_parse_dates(parsers._concat_date_cols(date_cols)) - return res - - df = self.read_csv(StringIO(data), header=None, - date_parser=func, - prefix='X', - parse_dates={'nominal': [1, 2], - 'actual': [1, 3]}) - assert 'nominal' in df - assert 'actual' in df - assert 'X1' not in df - assert 'X2' not in df - assert 'X3' not in df - - d = datetime(1999, 1, 27, 19, 0) - assert df.loc[0, 'nominal'] == d - - df = self.read_csv(StringIO(data), header=None, - date_parser=func, - parse_dates={'nominal': [1, 2], - 'actual': [1, 3]}, - keep_date_col=True) - assert 'nominal' in df - assert 'actual' in df - - assert 1 in df - assert 2 in df - assert 3 in df - - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - df = self.read_csv(StringIO(data), header=None, - prefix='X', parse_dates=[[1, 2], [1, 3]]) - - assert 'X1_X2' in df - assert 'X1_X3' in df - assert 'X1' not in df - assert 'X2' not in df - assert 'X3' not in df - - d = datetime(1999, 1, 27, 19, 0) - assert df.loc[0, 'X1_X2'] == d - - df = self.read_csv(StringIO(data), header=None, - parse_dates=[[1, 2], [1, 3]], keep_date_col=True) - - assert '1_2' in df - assert '1_3' in df - assert 1 in df - assert 2 in df - assert 3 in df - - data = '''\ -KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -''' - df = self.read_csv(StringIO(data), sep=',', header=None, - parse_dates=[1], index_col=1) - d = datetime(1999, 1, 27, 19, 0) - assert df.index[0] == d - - def test_multiple_date_cols_int_cast(self): - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - import pandas.io.date_converters as conv - - # it works! 
- df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec, - date_parser=conv.parse_date_time) - assert 'nominal' in df - - def test_multiple_date_col_timestamp_parse(self): - data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 -05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - result = self.read_csv(StringIO(data), sep=',', header=None, - parse_dates=[[0, 1]], date_parser=Timestamp) - - ex_val = Timestamp('05/31/2012 15:30:00.029') - assert result['0_1'][0] == ex_val - - def test_multiple_date_cols_with_header(self): - data = """\ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - - df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) - assert not isinstance(df.nominal[0], compat.string_types) - - ts_data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - def test_multiple_date_col_name_collision(self): - with pytest.raises(ValueError): - self.read_csv(StringIO(self.ts_data), parse_dates={'ID': [1, 2]}) - - data = """\ -date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa - - with pytest.raises(ValueError): - self.read_csv(StringIO(data), parse_dates=[[1, 2]]) - - def test_date_parser_int_bug(self): - # See gh-3071 - log_file = StringIO( - 'posix_timestamp,elapsed,sys,user,queries,query_time,rows,' - 'accountid,userid,contactid,level,silo,method\n' - '1343103150,0.062353,0,4,6,0.01690,3,' - '12345,1,-1,3,invoice_InvoiceResource,search\n' - ) - - def f(posix_string): - return datetime.utcfromtimestamp(int(posix_string)) - - # it works! 
- self.read_csv(log_file, index_col=0, parse_dates=[0], date_parser=f) - - def test_nat_parse(self): - # See gh-3062 - df = DataFrame(dict({ - 'A': np.asarray(lrange(10), dtype='float64'), - 'B': pd.Timestamp('20010101')})) - df.iloc[3:6, :] = np.nan - - with tm.ensure_clean('__nat_parse_.csv') as path: - df.to_csv(path) - result = self.read_csv(path, index_col=0, parse_dates=['B']) - tm.assert_frame_equal(result, df) - - expected = Series(dict(A='float64', B='datetime64[ns]')) - tm.assert_series_equal(expected, result.dtypes) - - # test with NaT for the nan_rep - # we don't have a method to specify the Datetime na_rep - # (it defaults to '') - df.to_csv(path) - result = self.read_csv(path, index_col=0, parse_dates=['B']) - tm.assert_frame_equal(result, df) - - def test_csv_custom_parser(self): - data = """A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - f = lambda x: datetime.strptime(x, '%Y%m%d') - df = self.read_csv(StringIO(data), date_parser=f) - expected = self.read_csv(StringIO(data), parse_dates=True) - tm.assert_frame_equal(df, expected) - - def test_parse_dates_implicit_first_col(self): - data = """A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - df = self.read_csv(StringIO(data), parse_dates=True) - expected = self.read_csv(StringIO(data), index_col=0, parse_dates=True) - assert isinstance( - df.index[0], (datetime, np.datetime64, Timestamp)) - tm.assert_frame_equal(df, expected) - - def test_parse_dates_string(self): - data = """date,A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - rs = self.read_csv( - StringIO(data), index_col='date', parse_dates=['date']) - idx = date_range('1/1/2009', periods=3) - idx.name = 'date' - xp = DataFrame({'A': ['a', 'b', 'c'], - 'B': [1, 3, 4], - 'C': [2, 4, 5]}, idx) - tm.assert_frame_equal(rs, xp) - - def test_yy_format_with_yearfirst(self): - data = """date,time,B,C -090131,0010,1,2 -090228,1020,3,4 -090331,0830,5,6 -""" - - # See gh-217 - import dateutil - if LooseVersion(dateutil.__version__) >= LooseVersion('2.5.0'): - pytest.skip("testing yearfirst=True not-support" - "on datetutil < 2.5.0 this works but" - "is wrong") - - rs = self.read_csv(StringIO(data), index_col=0, - parse_dates=[['date', 'time']]) - idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0)], - dtype=object, name='date_time') - xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) - tm.assert_frame_equal(rs, xp) - - rs = self.read_csv(StringIO(data), index_col=0, - parse_dates=[[0, 1]]) - idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0)], - dtype=object, name='date_time') - xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) - tm.assert_frame_equal(rs, xp) - - def test_parse_dates_column_list(self): - data = 'a,b,c\n01/01/2010,1,15/02/2010' - - expected = DataFrame({'a': [datetime(2010, 1, 1)], 'b': [1], - 'c': [datetime(2010, 2, 15)]}) - expected = expected.set_index(['a', 'b']) - - df = self.read_csv(StringIO(data), index_col=[0, 1], - parse_dates=[0, 2], dayfirst=True) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(data), index_col=[0, 1], - parse_dates=['a', 'c'], dayfirst=True) - tm.assert_frame_equal(df, expected) - - def test_multi_index_parse_dates(self): - data = """index1,index2,A,B,C -20090101,one,a,1,2 -20090101,two,b,3,4 -20090101,three,c,4,5 -20090102,one,a,1,2 -20090102,two,b,3,4 -20090102,three,c,4,5 -20090103,one,a,1,2 -20090103,two,b,3,4 
-20090103,three,c,4,5 -""" - df = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True) - assert isinstance(df.index.levels[0][0], - (datetime, np.datetime64, Timestamp)) - - # specify columns out of order! - df2 = self.read_csv(StringIO(data), index_col=[1, 0], parse_dates=True) - assert isinstance(df2.index.levels[1][0], - (datetime, np.datetime64, Timestamp)) - - def test_parse_dates_custom_euroformat(self): - text = """foo,bar,baz -31/01/2010,1,2 -01/02/2010,1,NA -02/02/2010,1,2 -""" - parser = lambda d: parse_date(d, dayfirst=True) - df = self.read_csv(StringIO(text), - names=['time', 'Q', 'NTU'], header=0, - index_col=0, parse_dates=True, - date_parser=parser, na_values=['NA']) - - exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1), - datetime(2010, 2, 2)], name='time') - expected = DataFrame({'Q': [1, 1, 1], 'NTU': [2, np.nan, 2]}, - index=exp_index, columns=['Q', 'NTU']) - tm.assert_frame_equal(df, expected) - - parser = lambda d: parse_date(d, day_first=True) - pytest.raises(TypeError, self.read_csv, - StringIO(text), skiprows=[0], - names=['time', 'Q', 'NTU'], index_col=0, - parse_dates=True, date_parser=parser, - na_values=['NA']) - - def test_parse_tz_aware(self): - # See gh-1693 - data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5") - - # it works - result = self.read_csv(data, index_col=0, parse_dates=True) - stamp = result.index[0] - assert stamp.minute == 39 - assert result.index.tz is pytz.utc - - def test_multiple_date_cols_index(self): - data = """ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - xp = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) - df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, - index_col='nominal') - tm.assert_frame_equal(xp.set_index('nominal'), df) - df2 = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, - index_col=0) - tm.assert_frame_equal(df2, df) - - df3 = self.read_csv(StringIO(data), parse_dates=[[1, 2]], index_col=0) - tm.assert_frame_equal(df3, df, check_names=False) - - def test_multiple_date_cols_chunked(self): - df = self.read_csv(StringIO(self.ts_data), parse_dates={ - 'nominal': [1, 2]}, index_col='nominal') - reader = self.read_csv(StringIO(self.ts_data), - parse_dates={'nominal': [1, 2]}, - index_col='nominal', chunksize=2) - - chunks = list(reader) - - assert 'nominalTime' not in df - - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) - - def test_multiple_date_col_named_components(self): - xp = self.read_csv(StringIO(self.ts_data), - parse_dates={'nominal': [1, 2]}, - index_col='nominal') - colspec = {'nominal': ['date', 'nominalTime']} - df = self.read_csv(StringIO(self.ts_data), parse_dates=colspec, - index_col='nominal') - tm.assert_frame_equal(df, xp) - - def test_multiple_date_col_multiple_index(self): - df = self.read_csv(StringIO(self.ts_data), - parse_dates={'nominal': [1, 2]}, - index_col=['nominal', 'ID']) - - xp = self.read_csv(StringIO(self.ts_data), - parse_dates={'nominal': [1, 
2]}) - - tm.assert_frame_equal(xp.set_index(['nominal', 'ID']), df) - - def test_read_with_parse_dates_scalar_non_bool(self): - # See gh-5636 - errmsg = ("Only booleans, lists, and " - "dictionaries are accepted " - "for the 'parse_dates' parameter") - data = """A,B,C - 1,2,2003-11-1""" - - with pytest.raises(TypeError, match=errmsg): - self.read_csv(StringIO(data), parse_dates="C") - with pytest.raises(TypeError, match=errmsg): - self.read_csv(StringIO(data), parse_dates="C", index_col="C") - - def test_read_with_parse_dates_invalid_type(self): - errmsg = ("Only booleans, lists, and " - "dictionaries are accepted " - "for the 'parse_dates' parameter") - data = """A,B,C - 1,2,2003-11-1""" - - with pytest.raises(TypeError, match=errmsg): - self.read_csv(StringIO(data), parse_dates=(1,)) - with pytest.raises(TypeError, match=errmsg): - self.read_csv(StringIO(data), parse_dates=np.array([4, 5])) - with pytest.raises(TypeError, match=errmsg): - self.read_csv(StringIO(data), parse_dates={1, 3, 3}) - - def test_parse_dates_empty_string(self): - # see gh-2263 - data = "Date, test\n2012-01-01, 1\n,2" - result = self.read_csv(StringIO(data), parse_dates=["Date"], - na_filter=False) - assert result['Date'].isna()[1] - - def test_parse_dates_noconvert_thousands(self): - # see gh-14066 - data = 'a\n04.15.2016' - - expected = DataFrame([datetime(2016, 4, 15)], columns=['a']) - result = self.read_csv(StringIO(data), parse_dates=['a'], - thousands='.') - tm.assert_frame_equal(result, expected) - - exp_index = DatetimeIndex(['2016-04-15'], name='a') - expected = DataFrame(index=exp_index) - result = self.read_csv(StringIO(data), index_col=0, - parse_dates=True, thousands='.') - tm.assert_frame_equal(result, expected) - - data = 'a,b\n04.15.2016,09.16.2013' - - expected = DataFrame([[datetime(2016, 4, 15), - datetime(2013, 9, 16)]], - columns=['a', 'b']) - result = self.read_csv(StringIO(data), parse_dates=['a', 'b'], - thousands='.') - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[datetime(2016, 4, 15), - datetime(2013, 9, 16)]], - columns=['a', 'b']) - expected = expected.set_index(['a', 'b']) - result = self.read_csv(StringIO(data), index_col=[0, 1], - parse_dates=True, thousands='.') - tm.assert_frame_equal(result, expected) - - def test_parse_date_time_multi_level_column_name(self): - data = """\ -D,T,A,B -date, time,a,b -2001-01-05, 09:00:00, 0.0, 10. -2001-01-06, 00:00:00, 1.0, 11. -""" - datecols = {'date_time': [0, 1]} - result = self.read_csv(StringIO(data), sep=',', header=[0, 1], - parse_dates=datecols, - date_parser=conv.parse_date_time) - - expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.], - [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]] - expected = DataFrame(expected_data, - columns=['date_time', ('A', 'a'), ('B', 'b')]) - tm.assert_frame_equal(result, expected) - - def test_parse_date_time(self): - dates = np.array(['2007/1/3', '2008/2/4'], dtype=object) - times = np.array(['05:07:09', '06:08:00'], dtype=object) - expected = np.array([datetime(2007, 1, 3, 5, 7, 9), - datetime(2008, 2, 4, 6, 8, 0)]) - - result = conv.parse_date_time(dates, times) - assert (result == expected).all() - - data = """\ -date, time, a, b -2001-01-05, 10:00:00, 0.0, 10. -2001-01-05, 00:00:00, 1., 11. 
-""" - datecols = {'date_time': [0, 1]} - df = self.read_csv(StringIO(data), sep=',', header=0, - parse_dates=datecols, - date_parser=conv.parse_date_time) - assert 'date_time' in df - assert df.date_time.loc[0] == datetime(2001, 1, 5, 10, 0, 0) - - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") - - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec, - date_parser=conv.parse_date_time) - - def test_parse_date_fields(self): - years = np.array([2007, 2008]) - months = np.array([1, 2]) - days = np.array([3, 4]) - result = conv.parse_date_fields(years, months, days) - expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) - assert (result == expected).all() - - data = ("year, month, day, a\n 2001 , 01 , 10 , 10.\n" - "2001 , 02 , 1 , 11.") - datecols = {'ymd': [0, 1, 2]} - df = self.read_csv(StringIO(data), sep=',', header=0, - parse_dates=datecols, - date_parser=conv.parse_date_fields) - assert 'ymd' in df - assert df.ymd.loc[0] == datetime(2001, 1, 10) - - def test_datetime_six_col(self): - years = np.array([2007, 2008]) - months = np.array([1, 2]) - days = np.array([3, 4]) - hours = np.array([5, 6]) - minutes = np.array([7, 8]) - seconds = np.array([9, 0]) - expected = np.array([datetime(2007, 1, 3, 5, 7, 9), - datetime(2008, 2, 4, 6, 8, 0)]) - - result = conv.parse_all_fields(years, months, days, - hours, minutes, seconds) - - assert (result == expected).all() - - data = """\ -year, month, day, hour, minute, second, a, b -2001, 01, 05, 10, 00, 0, 0.0, 10. -2001, 01, 5, 10, 0, 00, 1., 11. -""" - datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} - df = self.read_csv(StringIO(data), sep=',', header=0, - parse_dates=datecols, - date_parser=conv.parse_all_fields) - assert 'ymdHMS' in df - assert df.ymdHMS.loc[0] == datetime(2001, 1, 5, 10, 0, 0) - - def test_datetime_fractional_seconds(self): - data = """\ -year, month, day, hour, minute, second, a, b -2001, 01, 05, 10, 00, 0.123456, 0.0, 10. -2001, 01, 5, 10, 0, 0.500000, 1., 11. -""" - datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} - df = self.read_csv(StringIO(data), sep=',', header=0, - parse_dates=datecols, - date_parser=conv.parse_all_fields) - assert 'ymdHMS' in df - assert df.ymdHMS.loc[0] == datetime(2001, 1, 5, 10, 0, 0, - microsecond=123456) - assert df.ymdHMS.loc[1] == datetime(2001, 1, 5, 10, 0, 0, - microsecond=500000) - - def test_generic(self): - data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11." 
- datecols = {'ym': [0, 1]} - dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1) - df = self.read_csv(StringIO(data), sep=',', header=0, - parse_dates=datecols, - date_parser=dateconverter) - assert 'ym' in df - assert df.ym.loc[0] == date(2001, 1, 1) - - def test_dateparser_resolution_if_not_ns(self): - # GH 10245 - data = """\ -date,time,prn,rxstatus -2013-11-03,19:00:00,126,00E80000 -2013-11-03,19:00:00,23,00E80000 -2013-11-03,19:00:00,13,00E80000 -""" - - def date_parser(date, time): - datetime = np_array_datetime64_compat( - date + 'T' + time + 'Z', dtype='datetime64[s]') - return datetime - - df = self.read_csv(StringIO(data), date_parser=date_parser, - parse_dates={'datetime': ['date', 'time']}, - index_col=['datetime', 'prn']) - - datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3, - dtype='datetime64[s]') - df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3}, - index=MultiIndex.from_tuples( - [(datetimes[0], 126), - (datetimes[1], 23), - (datetimes[2], 13)], - names=['datetime', 'prn'])) - tm.assert_frame_equal(df, df_correct) - - def test_parse_date_column_with_empty_string(self): - # GH 6428 - data = """case,opdate - 7,10/18/2006 - 7,10/18/2008 - 621, """ - result = self.read_csv(StringIO(data), parse_dates=['opdate']) - expected_data = [[7, '10/18/2006'], - [7, '10/18/2008'], - [621, ' ']] - expected = DataFrame(expected_data, columns=['case', 'opdate']) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("data,expected", [ - ("a\n135217135789158401\n1352171357E+5", - DataFrame({"a": [135217135789158401, - 135217135700000]}, dtype="float64")), - ("a\n99999999999\n123456789012345\n1234E+0", - DataFrame({"a": [99999999999, - 123456789012345, - 1234]}, dtype="float64")) - ]) - @pytest.mark.parametrize("parse_dates", [True, False]) - def test_parse_date_float(self, data, expected, parse_dates): - # see gh-2697 - # - # Date parsing should fail, so we leave the data untouched - # (i.e. float precision should remain unchanged). - result = self.read_csv(StringIO(data), parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) - - def test_parse_timezone(self): - # gh-22256 - data = """dt,val - 2018-01-04 09:01:00+09:00,23350 - 2018-01-04 09:02:00+09:00,23400 - 2018-01-04 09:03:00+09:00,23400 - 2018-01-04 09:04:00+09:00,23400 - 2018-01-04 09:05:00+09:00,23400""" - parsed = self.read_csv(StringIO(data), parse_dates=['dt']) - dti = pd.DatetimeIndex(start='2018-01-04 09:01:00', - end='2018-01-04 09:05:00', freq='1min', - tz=pytz.FixedOffset(540)) - expected_data = {'dt': dti, 'val': [23350, 23400, 23400, 23400, 23400]} - expected = DataFrame(expected_data) - tm.assert_frame_equal(parsed, expected) diff --git a/pandas/tests/io/parser/skiprows.py b/pandas/tests/io/parser/skiprows.py deleted file mode 100644 index a051ee9b22d10..0000000000000 --- a/pandas/tests/io/parser/skiprows.py +++ /dev/null @@ -1,226 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that skipped rows are properly handled during -parsing for all of the parsers defined in parsers.py -""" - -from datetime import datetime - -import numpy as np -import pytest - -from pandas.compat import StringIO, lrange, range -from pandas.errors import EmptyDataError - -from pandas import DataFrame -import pandas.util.testing as tm - - -class SkipRowsTests(object): - - def test_skiprows_bug(self): - # see gh-505 - text = """#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -1/1/2000,1.,2.,3. 
-1/2/2000,4,5,6 -1/3/2000,7,8,9 -""" - data = self.read_csv(StringIO(text), skiprows=lrange(6), header=None, - index_col=0, parse_dates=True) - - data2 = self.read_csv(StringIO(text), skiprows=6, header=None, - index_col=0, parse_dates=True) - - expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), - columns=[1, 2, 3], - index=[datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)]) - expected.index.name = 0 - tm.assert_frame_equal(data, expected) - tm.assert_frame_equal(data, data2) - - def test_deep_skiprows(self): - # see gh-4382 - text = "a,b,c\n" + \ - "\n".join([",".join([str(i), str(i + 1), str(i + 2)]) - for i in range(10)]) - condensed_text = "a,b,c\n" + \ - "\n".join([",".join([str(i), str(i + 1), str(i + 2)]) - for i in [0, 1, 2, 3, 4, 6, 8, 9]]) - data = self.read_csv(StringIO(text), skiprows=[6, 8]) - condensed_data = self.read_csv(StringIO(condensed_text)) - tm.assert_frame_equal(data, condensed_data) - - def test_skiprows_blank(self): - # see gh-9832 - text = """#foo,a,b,c -#foo,a,b,c - -#foo,a,b,c -#foo,a,b,c - -1/1/2000,1.,2.,3. -1/2/2000,4,5,6 -1/3/2000,7,8,9 -""" - data = self.read_csv(StringIO(text), skiprows=6, header=None, - index_col=0, parse_dates=True) - - expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), - columns=[1, 2, 3], - index=[datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)]) - expected.index.name = 0 - tm.assert_frame_equal(data, expected) - - def test_skiprow_with_newline(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line 11 -line 12",2 -2,"line 21 -line 22",2 -3,"line 31",1""" - expected = [[2, 'line 21\nline 22', 2], - [3, 'line 31', 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = ('a,b,c\n~a\n b~,~e\n d~,' - '~f\n f~\n1,2,~12\n 13\n 14~') - expected = [['a\n b', 'e\n d', 'f\n f']] - expected = DataFrame(expected, columns=[ - 'a', 'b', 'c']) - df = self.read_csv(StringIO(data), - quotechar="~", - skiprows=[2]) - tm.assert_frame_equal(df, expected) - - data = ('Text,url\n~example\n ' - 'sentence\n one~,url1\n~' - 'example\n sentence\n two~,url2\n~' - 'example\n sentence\n three~,url3') - expected = [['example\n sentence\n two', 'url2']] - expected = DataFrame(expected, columns=[ - 'Text', 'url']) - df = self.read_csv(StringIO(data), - quotechar="~", - skiprows=[1, 3]) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line '11' line 12",2 -2,"line '21' line 22",2 -3,"line '31' line 32",1""" - expected = [[2, "line '21' line 22", 2], - [3, "line '31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_newline_and_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line \n'11' line 12",2 -2,"line \n'21' line 22",2 -3,"line \n'31' line 32",1""" - expected = [[2, "line \n'21' line 22", 2], - [3, "line \n'31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = """id,text,num_lines -1,"line '11\n' line 12",2 -2,"line '21\n' line 22",2 -3,"line '31\n' line 32",1""" - expected = [[2, "line '21\n' line 22", 2], - [3, "line '31\n' line 32", 1]] - expected = DataFrame(expected, 
columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = """id,text,num_lines -1,"line '11\n' \r\tline 12",2 -2,"line '21\n' \r\tline 22",2 -3,"line '31\n' \r\tline 32",1""" - expected = [[2, "line '21\n' \r\tline 22", 2], - [3, "line '31\n' \r\tline 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - def test_skiprows_lineterminator(self): - # see gh-9079 - data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', - '2007/01/01 01:00 0.2140 U M ', - '2007/01/01 02:00 0.2141 M O ', - '2007/01/01 04:00 0.2142 D M ']) - expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], - ['2007/01/01', '02:00', 0.2141, 'M', 'O'], - ['2007/01/01', '04:00', 0.2142, 'D', 'M']], - columns=['date', 'time', 'var', 'flag', - 'oflag']) - - # test with default line terminators "LF" and "CRLF" - df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(data.replace('\n', '\r\n')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - # "CR" is not respected with the Python parser yet - if self.engine == 'c': - df = self.read_csv(StringIO(data.replace('\n', '\r')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - def test_skiprows_infield_quote(self): - # see gh-14459 - data = 'a"\nb"\na\n1' - expected = DataFrame({'a': [1]}) - - df = self.read_csv(StringIO(data), skiprows=2) - tm.assert_frame_equal(df, expected) - - def test_skiprows_callable(self): - data = 'a\n1\n2\n3\n4\n5' - - skiprows = lambda x: x % 2 == 0 - expected = DataFrame({'1': [3, 5]}) - df = self.read_csv(StringIO(data), skiprows=skiprows) - tm.assert_frame_equal(df, expected) - - expected = DataFrame({'foo': [3, 5]}) - df = self.read_csv(StringIO(data), skiprows=skiprows, - header=0, names=['foo']) - tm.assert_frame_equal(df, expected) - - skiprows = lambda x: True - msg = "No columns to parse from file" - with pytest.raises(EmptyDataError, match=msg): - self.read_csv(StringIO(data), skiprows=skiprows) - - # This is a bad callable and should raise. - msg = "by zero" - skiprows = lambda x: 1 / 0 - with pytest.raises(ZeroDivisionError, match=msg): - self.read_csv(StringIO(data), skiprows=skiprows) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py new file mode 100644 index 0000000000000..fbf23f769e202 --- /dev/null +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +""" +Tests multithreading behaviour for reading and +parsing files for each parser defined in parsers.py +""" + +from __future__ import division + +from multiprocessing.pool import ThreadPool + +import numpy as np + +from pandas.compat import BytesIO, range + +import pandas as pd +from pandas import DataFrame +import pandas.util.testing as tm + + +def _construct_dataframe(num_rows): + """ + Construct a DataFrame for testing. + + Parameters + ---------- + num_rows : int + The number of rows for our DataFrame. 
+
+    Returns
+    -------
+    df : DataFrame
+    """
+    df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde"))
+    df["foo"] = "foo"
+    df["bar"] = "bar"
+    df["baz"] = "baz"
+    df["date"] = pd.date_range("20000101 09:00:00",
+                               periods=num_rows,
+                               freq="s")
+    df["int"] = np.arange(num_rows, dtype="int64")
+    return df
+
+
+def test_multi_thread_string_io_read_csv(all_parsers):
+    # see gh-11786
+    parser = all_parsers
+    max_row_range = 10000
+    num_files = 100
+
+    bytes_to_df = [
+        "\n".join(
+            ["%d,%d,%d" % (i, i, i) for i in range(max_row_range)]
+        ).encode() for _ in range(num_files)]
+    files = [BytesIO(b) for b in bytes_to_df]
+
+    # Read all files in many threads.
+    pool = ThreadPool(8)
+
+    results = pool.map(parser.read_csv, files)
+    first_result = results[0]
+
+    for result in results:
+        tm.assert_frame_equal(first_result, result)
+
+
+def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
+    """
+    Generate a DataFrame using multiple threads.
+
+    Parameters
+    ----------
+    parser : BaseParser
+        The parser object to use for reading the data.
+    path : str
+        The location of the CSV file to read.
+    num_rows : int
+        The total number of rows to read.
+    num_tasks : int
+        The number of tasks to use for reading this DataFrame.
+
+    Returns
+    -------
+    df : DataFrame
+    """
+    def reader(arg):
+        """
+        Create a reader for part of the CSV.
+
+        Parameters
+        ----------
+        arg : tuple
+            A tuple of the following:
+
+            * start : int
+                The row at which to start parsing the CSV.
+            * nrows : int
+                The number of rows to read.
+
+        Returns
+        -------
+        df : DataFrame
+        """
+        start, nrows = arg
+
+        if not start:
+            return parser.read_csv(path, index_col=0, header=0,
+                                   nrows=nrows, parse_dates=["date"])
+
+        return parser.read_csv(path, index_col=0, header=None,
+                               skiprows=int(start) + 1,
+                               nrows=nrows, parse_dates=[9])
+
+    tasks = [
+        (num_rows * i // num_tasks,
+         num_rows // num_tasks) for i in range(num_tasks)
+    ]
+
+    pool = ThreadPool(processes=num_tasks)
+    results = pool.map(reader, tasks)
+
+    header = results[0].columns
+
+    for r in results[1:]:
+        r.columns = header
+
+    final_dataframe = pd.concat(results)
+    return final_dataframe
+
+
+def test_multi_thread_path_multipart_read_csv(all_parsers):
+    # see gh-11786
+    num_tasks = 4
+    num_rows = 100000
+
+    parser = all_parsers
+    file_name = "__thread_pool_reader__.csv"
+    df = _construct_dataframe(num_rows)
+
+    with tm.ensure_clean(file_name) as path:
+        df.to_csv(path)
+
+        final_dataframe = _generate_multi_thread_dataframe(parser, path,
+                                                           num_rows, num_tasks)
+        tm.assert_frame_equal(df, final_dataframe)
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
new file mode 100644
index 0000000000000..921984bc44e50
--- /dev/null
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -0,0 +1,423 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that NA values are properly handled during
+parsing for all of the parsers defined in parsers.py
+"""
+
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO, range
+
+from pandas import DataFrame, Index, MultiIndex
+import pandas.util.testing as tm
+
+import pandas.io.common as com
+
+
+def test_string_nas(all_parsers):
+    parser = all_parsers
+    data = """A,B,C
+a,b,c
+d,,f
+,g,h
+"""
+    result = parser.read_csv(StringIO(data))
+    expected = DataFrame([["a", "b", "c"],
+                          ["d", np.nan, "f"],
+                          [np.nan, "g", "h"]],
+                         columns=["A", "B", "C"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_detect_string_na(all_parsers):
+    parser = all_parsers
+    data = """A,B
+foo,bar
+NA,baz
+NaN,nan
+"""
+    expected = DataFrame([["foo", "bar"], [np.nan, "baz"],
+                          [np.nan, np.nan]], columns=["A", "B"])
+    result = parser.read_csv(StringIO(data))
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("na_values", [
+    ["-999.0", "-999"],
+    [-999, -999.0],
+    [-999.0, -999],
+    ["-999.0"], ["-999"],
+    [-999.0], [-999]
+])
+@pytest.mark.parametrize("data", [
+    """A,B
+-999,1.2
+2,-999
+3,4.5
+""",
+    """A,B
+-999,1.200
+2,-999.000
+3,4.500
+"""
+])
+def test_non_string_na_values(all_parsers, data, na_values):
+    # see gh-3611: with an odd float format, we can't match
+    # the string "-999.0" exactly but still need float matching
+    parser = all_parsers
+    expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
+                          [3.0, 4.5]], columns=["A", "B"])
+
+    result = parser.read_csv(StringIO(data), na_values=na_values)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_default_na_values(all_parsers):
+    _NA_VALUES = {"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A",
+                  "N/A", "n/a", "NA", "#NA", "NULL", "null", "NaN", "nan",
+                  "-NaN", "-nan", "#N/A N/A", ""}
+    assert _NA_VALUES == com._NA_VALUES
+
+    parser = all_parsers
+    nv = len(_NA_VALUES)
+
+    def f(i, v):
+        if i == 0:
+            buf = ""
+        elif i > 0:
+            buf = "".join([","] * i)
+
+        buf = "{0}{1}".format(buf, v)
+
+        if i < nv - 1:
+            buf = "{0}{1}".format(buf, "".join([","] * (nv - i - 1)))
+
+        return buf
+
+    data = StringIO("\n".join(f(i, v) for i, v in enumerate(_NA_VALUES)))
+    expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
+
+    result = parser.read_csv(data, header=None)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
+def test_custom_na_values(all_parsers, na_values):
+    parser = all_parsers
+    data = """A,B,C
+ignore,this,row
+1,NA,3
+-1.#IND,5,baz
+7,8,NaN
+"""
+    expected = DataFrame([[1., np.nan, 3], [np.nan, 5, np.nan],
+                          [7, 8, np.nan]], columns=["A", "B", "C"])
+    result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_bool_na_values(all_parsers):
+    data = """A,B,C
+True,False,True
+NA,True,False
+False,NA,True"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data))
+    expected = DataFrame({"A": np.array([True, np.nan, False], dtype=object),
+                          "B": np.array([False, True, np.nan], dtype=object),
+                          "C": [True, False, True]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_na_value_dict(all_parsers):
+    data = """A,B,C
+foo,bar,NA
+bar,foo,foo
+foo,bar,NA
+bar,foo,foo"""
+    parser = all_parsers
+    df = parser.read_csv(StringIO(data),
+                         na_values={"A": ["foo"], "B": ["bar"]})
+    expected = DataFrame({"A": [np.nan, "bar", np.nan, "bar"],
+                          "B": [np.nan, "foo", np.nan, "foo"],
+                          "C": [np.nan, "foo", np.nan, "foo"]})
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize("index_col,expected", [
+    ([0], DataFrame({"b": [np.nan], "c": [1], "d": [5]},
+                    index=Index([0], name="a"))),
+    ([0, 2], DataFrame({"b": [np.nan], "d": [5]},
+                       index=MultiIndex.from_tuples(
+                           [(0, 1)], names=["a", "c"]))),
+    (["a", "c"], DataFrame({"b": [np.nan], "d": [5]},
+                           index=MultiIndex.from_tuples(
+                               [(0, 1)], names=["a", "c"]))),
+])
+def test_na_value_dict_multi_index(all_parsers, index_col, expected):
+    data = """\
+a,b,c,d
+0,NA,1,5
+"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), na_values=set(),
+                             index_col=index_col)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("kwargs,expected", [
+    (dict(), DataFrame({"A": ["a", "b", np.nan, "d", "e",
np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", + np.nan, "seven"]})), + (dict(na_values={"A": [], "C": []}, keep_default_na=False), + DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"]})), + (dict(na_values=["a"], keep_default_na=False), + DataFrame({"A": [np.nan, "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"]})), + (dict(na_values={"A": [], "C": []}), + DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, + "five", np.nan, "seven"]})), +]) +def test_na_values_keep_default(all_parsers, kwargs, expected): + data = """\ +A,B,C +a,1,one +b,2,two +,3,three +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_no_na_values_no_keep_default(all_parsers): + # see gh-4318: passing na_values=None and + # keep_default_na=False yields 'None" as a na_value + data = """\ +A,B,C +a,1,None +b,2,two +,3,None +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), keep_default_na=False) + + expected = DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["None", "two", "None", "nan", + "five", "", "seven"]}) + tm.assert_frame_equal(result, expected) + + +def test_no_keep_default_na_dict_na_values(all_parsers): + # see gh-19227 + data = "a,b\n,2" + parser = all_parsers + result = parser.read_csv(StringIO(data), na_values={"b": ["2"]}, + keep_default_na=False) + expected = DataFrame({"a": [""], "b": [np.nan]}) + tm.assert_frame_equal(result, expected) + + +def test_no_keep_default_na_dict_na_scalar_values(all_parsers): + # see gh-19227 + # + # Scalar values shouldn't cause the parsing to crash or fail. 
+    data = "a,b\n1,2"
+    parser = all_parsers
+    df = parser.read_csv(StringIO(data), na_values={"b": 2},
+                         keep_default_na=False)
+    expected = DataFrame({"a": [1], "b": [np.nan]})
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize("col_zero_na_values", [
+    113125, "113125"
+])
+def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers,
+                                                      col_zero_na_values):
+    # see gh-19227
+    data = """\
+113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
+729639,"qwer","",asdfkj,466.681,,252.373
+"""
+    parser = all_parsers
+    expected = DataFrame({0: [np.nan, 729639.0],
+                          1: [np.nan, "qwer"],
+                          2: ["/blaha", np.nan],
+                          3: ["kjsdkj", "asdfkj"],
+                          4: [412.166, 466.681],
+                          5: ["225.874", ""],
+                          6: [np.nan, 252.373]})
+
+    result = parser.read_csv(StringIO(data), header=None,
+                             keep_default_na=False,
+                             na_values={2: "", 6: "214.008",
+                                        1: "blah", 0: col_zero_na_values})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("na_filter,row_data", [
+    (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
+    (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
+])
+def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
+    data = """\
+A,B
+1,A
+nan,B
+3,C
+"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), na_values=["B"],
+                             na_filter=na_filter)
+
+    expected = DataFrame(row_data, columns=["A", "B"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_na_trailing_columns(all_parsers):
+    parser = all_parsers
+    data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
+2012-03-14,USD,AAPL,BUY,1000
+2012-05-12,USD,SBUX,SELL,500"""
+
+    # Trailing columns should be all NaN.
+    result = parser.read_csv(StringIO(data))
+    expected = DataFrame([
+        ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
+        ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
+    ], columns=["Date", "Currency", "Symbol", "Type",
+                "Units", "UnitPrice", "Cost", "Tax"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("na_values,row_data", [
+    (1, [[np.nan, 2.0], [2.0, np.nan]]),
+    ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
+])
+def test_na_values_scalar(all_parsers, na_values, row_data):
+    # see gh-12224
+    parser = all_parsers
+    names = ["a", "b"]
+    data = "1,2\n2,1"
+
+    result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
+    expected = DataFrame(row_data, columns=names)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_na_values_dict_aliasing(all_parsers):
+    parser = all_parsers
+    na_values = {"a": 2, "b": 1}
+    na_values_copy = na_values.copy()
+
+    names = ["a", "b"]
+    data = "1,2\n2,1"
+
+    expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
+    result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
+
+    tm.assert_frame_equal(result, expected)
+    tm.assert_dict_equal(na_values, na_values_copy)
+
+
+def test_na_values_dict_col_index(all_parsers):
+    # see gh-14203
+    data = "a\nfoo\n1"
+    parser = all_parsers
+    na_values = {0: "foo"}
+
+    result = parser.read_csv(StringIO(data), na_values=na_values)
+    expected = DataFrame({"a": [np.nan, 1]})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("data,kwargs,expected", [
+    (str(2**63) + "\n" + str(2**63 + 1),
+     dict(na_values=[2**63]), DataFrame([str(2**63), str(2**63 + 1)])),
+    (str(2**63) + ",1" + "\n,2",
+     dict(), DataFrame([[str(2**63), 1], ["", 2]])),
+    (str(2**63) + "\n1",
+     dict(na_values=[2**63]), DataFrame([np.nan, 1])),
+])
+def test_na_values_uint64(all_parsers, data, kwargs,
expected): + # see gh-14983 + parser = all_parsers + result = parser.read_csv(StringIO(data), header=None, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_empty_na_values_no_default_with_index(all_parsers): + # see gh-15835 + data = "a,1\nb,2" + parser = all_parsers + expected = DataFrame({"1": [2]}, index=Index(["b"], name="a")) + + result = parser.read_csv(StringIO(data), index_col=0, + keep_default_na=False) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("na_filter,index_data", [ + (False, ["", "5"]), + (True, [np.nan, 5.0]), +]) +def test_no_na_filter_on_index(all_parsers, na_filter, index_data): + # see gh-5239 + # + # Don't parse NA-values in index unless na_filter=True + parser = all_parsers + data = "a,b,c\n1,,3\n4,5,6" + + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, + index=Index(index_data, name="b")) + result = parser.read_csv(StringIO(data), index_col=[1], + na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +def test_inf_na_values_with_int_index(all_parsers): + # see gh-17128 + parser = all_parsers + data = "idx,col1,col2\n1,3,4\n2,inf,-inf" + + # Don't fail with OverflowError with inf's and integer index column. + out = parser.read_csv(StringIO(data), index_col=[0], + na_values=["inf", "-inf"]) + expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]}, + index=Index([1, 2], name="idx")) + tm.assert_frame_equal(out, expected) + + +@pytest.mark.parametrize("na_filter", [True, False]) +def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): + # see gh-20377 + parser = all_parsers + data = "a,b,c\n1,,3\n4,5,6" + + # na_filter=True --> missing value becomes NaN. + # na_filter=False --> missing value remains empty string. + empty = np.nan if na_filter else "" + expected = DataFrame({"a": ["1", "4"], + "b": [empty, "5"], + "c": ["3", "6"]}) + + result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py new file mode 100644 index 0000000000000..e70ae03e007ee --- /dev/null +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -0,0 +1,849 @@ +# -*- coding: utf-8 -*- + +""" +Tests date parsing functionality for all of the +parsers defined in parsers.py +""" + +from datetime import date, datetime + +import numpy as np +import pytest +import pytz + +from pandas._libs.tslib import Timestamp +from pandas._libs.tslibs import parsing +from pandas.compat import StringIO, lrange, parse_date +from pandas.compat.numpy import np_array_datetime64_compat + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, Index, MultiIndex +from pandas.core.indexes.datetimes import date_range +import pandas.util.testing as tm + +import pandas.io.date_converters as conv +import pandas.io.parsers as parsers + + +def test_separator_date_conflict(all_parsers): + # Regression test for gh-4678 + # + # Make sure thousands separator and + # date parsing do not conflict. 
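+    # "-" does double duty in this row: thousands separator inside
+    # "1-000.215" and field separator inside the date "06-02-2013".
+    # Both must survive: the two date fields combine into one column
+    # while 1-000.215 still parses as 1000.215.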
+ parser = all_parsers + data = "06-02-2013;13:00;1-000.215" + expected = DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], + columns=["Date", 2]) + + df = parser.read_csv(StringIO(data), sep=";", thousands="-", + parse_dates={"Date": [0, 1]}, header=None) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("keep_date_col", [True, False]) +def test_multiple_date_col_custom(all_parsers, keep_date_col): + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + parser = all_parsers + + def date_parser(*date_cols): + """ + Test date parser. + + Parameters + ---------- + date_cols : args + The list of data columns to parse. + + Returns + ------- + parsed : Series + """ + return parsing.try_parse_dates(parsers._concat_date_cols(date_cols)) + + result = parser.read_csv(StringIO(data), header=None, + date_parser=date_parser, prefix="X", + parse_dates={"actual": [1, 2], + "nominal": [1, 3]}, + keep_date_col=keep_date_col) + expected = DataFrame([ + [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), + "KORD", "19990127", " 19:00:00", " 18:56:00", + 0.81, 2.81, 7.2, 0.0, 280.0], + [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), + "KORD", "19990127", " 20:00:00", " 19:56:00", + 0.01, 2.21, 7.2, 0.0, 260.0], + [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), + "KORD", "19990127", " 21:00:00", " 20:56:00", + -0.59, 2.21, 5.7, 0.0, 280.0], + [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), + "KORD", "19990127", " 21:00:00", " 21:18:00", + -0.99, 2.01, 3.6, 0.0, 270.0], + [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), + "KORD", "19990127", " 22:00:00", " 21:56:00", + -0.59, 1.71, 5.1, 0.0, 290.0], + [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), + "KORD", "19990127", " 23:00:00", " 22:56:00", + -0.59, 1.71, 4.6, 0.0, 280.0], + ], columns=["actual", "nominal", "X0", "X1", "X2", + "X3", "X4", "X5", "X6", "X7", "X8"]) + + if not keep_date_col: + expected = expected.drop(["X1", "X2", "X3"], axis=1) + elif parser.engine == "python": + expected["X1"] = expected["X1"].astype(np.int64) + + # Python can sometimes be flaky about how + # the aggregated columns are entered, so + # this standardizes the order. 
+ result = result[expected.columns] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("keep_date_col", [True, False]) +def test_multiple_date_col(all_parsers, keep_date_col): + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), header=None, + prefix="X", parse_dates=[[1, 2], [1, 3]], + keep_date_col=keep_date_col) + expected = DataFrame([ + [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), + "KORD", "19990127", " 19:00:00", " 18:56:00", + 0.81, 2.81, 7.2, 0.0, 280.0], + [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), + "KORD", "19990127", " 20:00:00", " 19:56:00", + 0.01, 2.21, 7.2, 0.0, 260.0], + [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), + "KORD", "19990127", " 21:00:00", " 20:56:00", + -0.59, 2.21, 5.7, 0.0, 280.0], + [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), + "KORD", "19990127", " 21:00:00", " 21:18:00", + -0.99, 2.01, 3.6, 0.0, 270.0], + [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), + "KORD", "19990127", " 22:00:00", " 21:56:00", + -0.59, 1.71, 5.1, 0.0, 290.0], + [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), + "KORD", "19990127", " 23:00:00", " 22:56:00", + -0.59, 1.71, 4.6, 0.0, 280.0], + ], columns=["X1_X2", "X1_X3", "X0", "X1", "X2", + "X3", "X4", "X5", "X6", "X7", "X8"]) + + if not keep_date_col: + expected = expected.drop(["X1", "X2", "X3"], axis=1) + elif parser.engine == "python": + expected["X1"] = expected["X1"].astype(np.int64) + + tm.assert_frame_equal(result, expected) + + +def test_date_col_as_index_col(all_parsers): + data = """\ +KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), header=None, prefix="X", + parse_dates=[1], index_col=1) + + index = Index([datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 22, 0)], name="X1") + expected = DataFrame([ + ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0], + ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0], + ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0], + ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0], + ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0], + ], columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], index=index) + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_cols_int_cast(all_parsers): + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + 
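+    # Column 1 holds the date and columns 2 and 3 hold two different
+    # time stamps, so "actual" merges columns 1 + 2 while "nominal"
+    # merges columns 1 + 3.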
parse_dates = {"actual": [1, 2], "nominal": [1, 3]} + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=None, + date_parser=conv.parse_date_time, + parse_dates=parse_dates, prefix="X") + expected = DataFrame([ + [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), + "KORD", 0.81], + [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), + "KORD", 0.01], + [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), + "KORD", -0.59], + [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), + "KORD", -0.99], + [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), + "KORD", -0.59], + [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), + "KORD", -0.59], + ], columns=["actual", "nominal", "X0", "X4"]) + + # Python can sometimes be flaky about how + # the aggregated columns are entered, so + # this standardizes the order. + result = result[expected.columns] + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_col_timestamp_parse(all_parsers): + parser = all_parsers + data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 +05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" + + result = parser.read_csv(StringIO(data), parse_dates=[[0, 1]], + header=None, date_parser=Timestamp) + expected = DataFrame([ + [Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, 1, "E", 0, np.nan, 1306.25], + [Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, 8, "E", 0, np.nan, 1306.25] + ], columns=["0_1", 2, 3, 4, 5, 6, 7]) + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_cols_with_header(all_parsers): + parser = all_parsers + data = """\ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) + expected = DataFrame([ + [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", + 0.81, 2.81, 7.2, 0.0, 280.0], + [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", + 0.01, 2.21, 7.2, 0.0, 260.0], + [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", + -0.59, 2.21, 5.7, 0.0, 280.0], + [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", + -0.99, 2.01, 3.6, 0.0, 270.0], + [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", + -0.59, 1.71, 5.1, 0.0, 290.0], + [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", + -0.59, 1.71, 4.6, 0.0, 280.0], + ], columns=["nominal", "ID", "ActualTime", "TDew", + "TAir", "Windspeed", "Precip", "WindDir"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,parse_dates,msg", [ + ("""\ +date_NominalTime,date,NominalTime +KORD1,19990127, 19:00:00 +KORD2,19990127, 20:00:00""", [[1, 2]], ("New date column already " + "in dict date_NominalTime")), + ("""\ +ID,date,nominalTime +KORD,19990127, 19:00:00 +KORD,19990127, 20:00:00""", dict(ID=[1, 2]), "Date column ID already in dict") +]) +def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), parse_dates=parse_dates) + + +def 
test_date_parser_int_bug(all_parsers):
+    # see gh-3071
+    parser = all_parsers
+    data = ("posix_timestamp,elapsed,sys,user,queries,query_time,rows,"
+            "accountid,userid,contactid,level,silo,method\n"
+            "1343103150,0.062353,0,4,6,0.01690,3,"
+            "12345,1,-1,3,invoice_InvoiceResource,search\n")
+
+    result = parser.read_csv(
+        StringIO(data), index_col=0, parse_dates=[0],
+        date_parser=lambda x: datetime.utcfromtimestamp(int(x)))
+    expected = DataFrame([[0.062353, 0, 4, 6, 0.01690, 3, 12345, 1, -1,
+                           3, "invoice_InvoiceResource", "search"]],
+                         columns=["elapsed", "sys", "user", "queries",
+                                  "query_time", "rows", "accountid",
+                                  "userid", "contactid", "level",
+                                  "silo", "method"],
+                         index=Index([Timestamp("2012-07-24 04:12:30")],
+                                     name="posix_timestamp"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_nat_parse(all_parsers):
+    # see gh-3062
+    parser = all_parsers
+    df = DataFrame(dict({"A": np.asarray(lrange(10), dtype="float64"),
+                         "B": pd.Timestamp("20010101")}))
+    df.iloc[3:6, :] = np.nan
+
+    with tm.ensure_clean("__nat_parse_.csv") as path:
+        df.to_csv(path)
+
+        result = parser.read_csv(path, index_col=0, parse_dates=["B"])
+        tm.assert_frame_equal(result, df)
+
+
+def test_csv_custom_parser(all_parsers):
+    data = """A,B,C
+20090101,a,1,2
+20090102,b,3,4
+20090103,c,4,5
+"""
+    parser = all_parsers
+    result = parser.read_csv(
+        StringIO(data),
+        date_parser=lambda x: datetime.strptime(x, "%Y%m%d"))
+    expected = parser.read_csv(StringIO(data), parse_dates=True)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_parse_dates_implicit_first_col(all_parsers):
+    data = """A,B,C
+20090101,a,1,2
+20090102,b,3,4
+20090103,c,4,5
+"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), parse_dates=True)
+
+    expected = parser.read_csv(StringIO(data), index_col=0,
+                               parse_dates=True)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_parse_dates_string(all_parsers):
+    data = """date,A,B,C
+20090101,a,1,2
+20090102,b,3,4
+20090103,c,4,5
+"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), index_col="date",
+                             parse_dates=["date"])
+    index = date_range("1/1/2009", periods=3)
+    index.name = "date"
+
+    expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4],
+                          "C": [2, 4, 5]}, index=index)
+    tm.assert_frame_equal(result, expected)
+
+
+# Bug in https://github.com/dateutil/dateutil/issues/217
+# has been addressed, but we just don't pass in the `yearfirst`
+@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*")
+@pytest.mark.parametrize("parse_dates", [
+    [["date", "time"]],
+    [[0, 1]]
+])
+def test_yy_format_with_year_first(all_parsers, parse_dates):
+    data = """date,time,B,C
+090131,0010,1,2
+090228,1020,3,4
+090331,0830,5,6
+"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), index_col=0,
+                             parse_dates=parse_dates)
+    index = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
+                           datetime(2009, 2, 28, 10, 20, 0),
+                           datetime(2009, 3, 31, 8, 30, 0)],
+                          dtype=object, name="date_time")
+    expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
+def test_parse_dates_column_list(all_parsers, parse_dates):
+    data = "a,b,c\n01/01/2010,1,15/02/2010"
+    parser = all_parsers
+
+    expected = DataFrame({"a": [datetime(2010, 1, 1)], "b": [1],
+                          "c": [datetime(2010, 2, 15)]})
+    expected = expected.set_index(["a", "b"])
+
+    result = parser.read_csv(StringIO(data), index_col=[0, 1],
+                             parse_dates=parse_dates, dayfirst=True)
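+    # "15/02/2010" only resolves to February 15th because of the
+    # dayfirst=True flag passed above.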
tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) +def test_multi_index_parse_dates(all_parsers, index_col): + data = """index1,index2,A,B,C +20090101,one,a,1,2 +20090101,two,b,3,4 +20090101,three,c,4,5 +20090102,one,a,1,2 +20090102,two,b,3,4 +20090102,three,c,4,5 +20090103,one,a,1,2 +20090103,two,b,3,4 +20090103,three,c,4,5 +""" + parser = all_parsers + index = MultiIndex.from_product([ + (datetime(2009, 1, 1), datetime(2009, 1, 2), + datetime(2009, 1, 3)), ("one", "two", "three")], + names=["index1", "index2"]) + + # Out of order. + if index_col == [1, 0]: + index = index.swaplevel(0, 1) + + expected = DataFrame([["a", 1, 2], ["b", 3, 4], ["c", 4, 5], + ["a", 1, 2], ["b", 3, 4], ["c", 4, 5], + ["a", 1, 2], ["b", 3, 4], ["c", 4, 5]], + columns=["A", "B", "C"], index=index) + result = parser.read_csv(StringIO(data), index_col=index_col, + parse_dates=True) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [ + dict(dayfirst=True), dict(day_first=True) +]) +def test_parse_dates_custom_euro_format(all_parsers, kwargs): + parser = all_parsers + data = """foo,bar,baz +31/01/2010,1,2 +01/02/2010,1,NA +02/02/2010,1,2 +""" + if "dayfirst" in kwargs: + df = parser.read_csv(StringIO(data), names=["time", "Q", "NTU"], + date_parser=lambda d: parse_date(d, **kwargs), + header=0, index_col=0, parse_dates=True, + na_values=["NA"]) + exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1), + datetime(2010, 2, 2)], name="time") + expected = DataFrame({"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, + index=exp_index, columns=["Q", "NTU"]) + tm.assert_frame_equal(df, expected) + else: + msg = "got an unexpected keyword argument 'day_first'" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), names=["time", "Q", "NTU"], + date_parser=lambda d: parse_date(d, **kwargs), + skiprows=[0], index_col=0, parse_dates=True, + na_values=["NA"]) + + +def test_parse_tz_aware(all_parsers): + # See gh-1693 + parser = all_parsers + data = "Date,x\n2012-06-13T01:39:00Z,0.5" + + result = parser.read_csv(StringIO(data), index_col=0, + parse_dates=True) + expected = DataFrame({"x": [0.5]}, index=Index([Timestamp( + "2012-06-13 01:39:00+00:00")], name="Date")) + tm.assert_frame_equal(result, expected) + assert result.index.tz is pytz.utc + + +@pytest.mark.parametrize("parse_dates,index_col", [ + ({"nominal": [1, 2]}, "nominal"), + ({"nominal": [1, 2]}, 0), + ([[1, 2]], 0), +]) +def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): + parser = all_parsers + data = """ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + expected = DataFrame([ + [datetime(1999, 1, 27, 19, 0), "KORD1", " 18:56:00", + 0.81, 2.81, 7.2, 0.0, 280.0], + [datetime(1999, 1, 27, 20, 0), "KORD2", " 19:56:00", + 0.01, 2.21, 7.2, 0.0, 260.0], + [datetime(1999, 1, 27, 21, 0), "KORD3", " 20:56:00", + -0.59, 2.21, 5.7, 0.0, 280.0], + [datetime(1999, 1, 27, 21, 0), "KORD4", " 21:18:00", + -0.99, 2.01, 3.6, 0.0, 270.0], + [datetime(1999, 1, 27, 22, 0), 
"KORD5", " 21:56:00", + -0.59, 1.71, 5.1, 0.0, 290.0], + [datetime(1999, 1, 27, 23, 0), "KORD6", " 22:56:00", + -0.59, 1.71, 4.6, 0.0, 280.0], + ], columns=["nominal", "ID", "ActualTime", "TDew", + "TAir", "Windspeed", "Precip", "WindDir"]) + expected = expected.set_index("nominal") + + if not isinstance(parse_dates, dict): + expected.index.name = "date_NominalTime" + + result = parser.read_csv(StringIO(data), parse_dates=parse_dates, + index_col=index_col) + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_cols_chunked(all_parsers): + parser = all_parsers + data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + expected = DataFrame([ + [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", + 0.81, 2.81, 7.2, 0.0, 280.0], + [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", + 0.01, 2.21, 7.2, 0.0, 260.0], + [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", + -0.59, 2.21, 5.7, 0.0, 280.0], + [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", + -0.99, 2.01, 3.6, 0.0, 270.0], + [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", + -0.59, 1.71, 5.1, 0.0, 290.0], + [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", + -0.59, 1.71, 4.6, 0.0, 280.0], + ], columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"]) + expected = expected.set_index("nominal") + + reader = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}, + index_col="nominal", chunksize=2) + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +def test_multiple_date_col_named_index_compat(all_parsers): + parser = all_parsers + data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + with_indices = parser.read_csv(StringIO(data), + parse_dates={"nominal": [1, 2]}, + index_col="nominal") + with_names = parser.read_csv(StringIO(data), index_col="nominal", + parse_dates={"nominal": [ + "date", "nominalTime"]}) + tm.assert_frame_equal(with_indices, with_names) + + +def test_multiple_date_col_multiple_index_compat(all_parsers): + parser = all_parsers + data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 
0.0000, 280.0000
+"""
+    result = parser.read_csv(StringIO(data), index_col=["nominal", "ID"],
+                             parse_dates={"nominal": [1, 2]})
+    expected = parser.read_csv(StringIO(data),
+                               parse_dates={"nominal": [1, 2]})
+
+    expected = expected.set_index(["nominal", "ID"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("kwargs", [dict(), dict(index_col="C")])
+def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
+    # see gh-5636
+    parser = all_parsers
+    msg = ("Only booleans, lists, and dictionaries "
+           "are accepted for the 'parse_dates' parameter")
+    data = """A,B,C
+    1,2,2003-11-1"""
+
+    with pytest.raises(TypeError, match=msg):
+        parser.read_csv(StringIO(data), parse_dates="C", **kwargs)
+
+
+@pytest.mark.parametrize("parse_dates", [
+    (1,), np.array([4, 5]), {1, 3, 3}
+])
+def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
+    parser = all_parsers
+    msg = ("Only booleans, lists, and dictionaries "
+           "are accepted for the 'parse_dates' parameter")
+    data = """A,B,C
+    1,2,2003-11-1"""
+
+    with pytest.raises(TypeError, match=msg):
+        parser.read_csv(StringIO(data), parse_dates=parse_dates)
+
+
+def test_parse_dates_empty_string(all_parsers):
+    # see gh-2263
+    parser = all_parsers
+    data = "Date,test\n2012-01-01,1\n,2"
+    result = parser.read_csv(StringIO(data), parse_dates=["Date"],
+                             na_filter=False)
+
+    expected = DataFrame([[datetime(2012, 1, 1), 1], [pd.NaT, 2]],
+                         columns=["Date", "test"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("data,kwargs,expected", [
+    ("a\n04.15.2016", dict(parse_dates=["a"]),
+     DataFrame([datetime(2016, 4, 15)], columns=["a"])),
+    ("a\n04.15.2016", dict(parse_dates=True, index_col=0),
+     DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"))),
+    ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=["a", "b"]),
+     DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]],
+               columns=["a", "b"])),
+    ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=True, index_col=[0, 1]),
+     DataFrame(index=MultiIndex.from_tuples(
+         [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]))),
+])
+def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
+    # see gh-14066
+    parser = all_parsers
+
+    result = parser.read_csv(StringIO(data), thousands=".", **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_parse_date_time_multi_level_column_name(all_parsers):
+    data = """\
+D,T,A,B
+date, time,a,b
+2001-01-05, 09:00:00, 0.0, 10.
+2001-01-06, 00:00:00, 1.0, 11.
+"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), header=[0, 1],
+                             parse_dates={"date_time": [0, 1]},
+                             date_parser=conv.parse_date_time)
+
+    expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.],
+                     [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]]
+    expected = DataFrame(expected_data,
+                         columns=["date_time", ("A", "a"), ("B", "b")])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("data,kwargs,expected", [
+    ("""\
+date,time,a,b
+2001-01-05, 10:00:00, 0.0, 10.
+2001-01-05, 00:00:00, 1., 11.
+""", dict(header=0, parse_dates={"date_time": [0, 1]}), + DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], + [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0]], + columns=["date_time", "a", "b"])), + (("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900"), + dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}), + DataFrame([ + [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), + "KORD", 0.81], + [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), + "KORD", 0.01], + [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), + "KORD", -0.59], + [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), + "KORD", -0.99], + [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), + "KORD", -0.59], + [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), + "KORD", -0.59]], columns=["actual", "nominal", 0, 4])), +]) +def test_parse_date_time(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time, + **kwargs) + + # Python can sometimes be flaky about how + # the aggregated columns are entered, so + # this standardizes the order. + result = result[expected.columns] + tm.assert_frame_equal(result, expected) + + +def test_parse_date_fields(all_parsers): + parser = all_parsers + data = ("year,month,day,a\n2001,01,10,10.\n" + "2001,02,1,11.") + result = parser.read_csv(StringIO(data), header=0, + parse_dates={"ymd": [0, 1, 2]}, + date_parser=conv.parse_date_fields) + + expected = DataFrame([[datetime(2001, 1, 10), 10.], + [datetime(2001, 2, 1), 11.]], columns=["ymd", "a"]) + tm.assert_frame_equal(result, expected) + + +def test_parse_date_all_fields(all_parsers): + parser = all_parsers + data = """\ +year,month,day,hour,minute,second,a,b +2001,01,05,10,00,0,0.0,10. +2001,01,5,10,0,00,1.,11. +""" + result = parser.read_csv(StringIO(data), header=0, + date_parser=conv.parse_all_fields, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}) + expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0]], + columns=["ymdHMS", "a", "b"]) + tm.assert_frame_equal(result, expected) + + +def test_datetime_fractional_seconds(all_parsers): + parser = all_parsers + data = """\ +year,month,day,hour,minute,second,a,b +2001,01,05,10,00,0.123456,0.0,10. +2001,01,5,10,0,0.500000,1.,11. +""" + result = parser.read_csv(StringIO(data), header=0, + date_parser=conv.parse_all_fields, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}) + expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0, + microsecond=123456), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0, + microsecond=500000), 1.0, 11.0]], + columns=["ymdHMS", "a", "b"]) + tm.assert_frame_equal(result, expected) + + +def test_generic(all_parsers): + parser = all_parsers + data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
+ + result = parser.read_csv(StringIO(data), header=0, + parse_dates={"ym": [0, 1]}, + date_parser=lambda y, m: date(year=int(y), + month=int(m), + day=1)) + expected = DataFrame([[date(2001, 1, 1), 10, 10.], + [date(2001, 2, 1), 1, 11.]], + columns=["ym", "day", "a"]) + tm.assert_frame_equal(result, expected) + + +def test_date_parser_resolution_if_not_ns(all_parsers): + # see gh-10245 + parser = all_parsers + data = """\ +date,time,prn,rxstatus +2013-11-03,19:00:00,126,00E80000 +2013-11-03,19:00:00,23,00E80000 +2013-11-03,19:00:00,13,00E80000 +""" + + def date_parser(dt, time): + return np_array_datetime64_compat(dt + "T" + time + "Z", + dtype="datetime64[s]") + + result = parser.read_csv(StringIO(data), date_parser=date_parser, + parse_dates={"datetime": ["date", "time"]}, + index_col=["datetime", "prn"]) + + datetimes = np_array_datetime64_compat(["2013-11-03T19:00:00Z"] * 3, + dtype="datetime64[s]") + expected = DataFrame(data={"rxstatus": ["00E80000"] * 3}, + index=MultiIndex.from_tuples( + [(datetimes[0], 126), (datetimes[1], 23), + (datetimes[2], 13)], names=["datetime", "prn"])) + tm.assert_frame_equal(result, expected) + + +def test_parse_date_column_with_empty_string(all_parsers): + # see gh-6428 + parser = all_parsers + data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, " + result = parser.read_csv(StringIO(data), parse_dates=["opdate"]) + + expected_data = [[7, "10/18/2006"], + [7, "10/18/2008"], + [621, " "]] + expected = DataFrame(expected_data, columns=["case", "opdate"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,expected", [ + ("a\n135217135789158401\n1352171357E+5", + DataFrame({"a": [135217135789158401, + 135217135700000]}, dtype="float64")), + ("a\n99999999999\n123456789012345\n1234E+0", + DataFrame({"a": [99999999999, + 123456789012345, + 1234]}, dtype="float64")) +]) +@pytest.mark.parametrize("parse_dates", [True, False]) +def test_parse_date_float(all_parsers, data, expected, parse_dates): + # see gh-2697 + # + # Date parsing should fail, so we leave the data untouched + # (i.e. float precision should remain unchanged). 
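+    # For instance, "1352171357E+5" must survive as the float
+    # 135217135700000.0 instead of being mangled by a date-parsing
+    # attempt.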
+ parser = all_parsers + + result = parser.read_csv(StringIO(data), parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_parse_timezone(all_parsers): + # see gh-22256 + parser = all_parsers + data = """dt,val + 2018-01-04 09:01:00+09:00,23350 + 2018-01-04 09:02:00+09:00,23400 + 2018-01-04 09:03:00+09:00,23400 + 2018-01-04 09:04:00+09:00,23400 + 2018-01-04 09:05:00+09:00,23400""" + result = parser.read_csv(StringIO(data), parse_dates=["dt"]) + + dti = pd.DatetimeIndex(start="2018-01-04 09:01:00", + end="2018-01-04 09:05:00", freq="1min", + tz=pytz.FixedOffset(540)) + expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} + + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 8c1641b1d2abe..11389a943bea2 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -12,18 +12,12 @@ import pandas.util.testing as tm from .common import ParserTests -from .multithread import MultithreadTests -from .na_values import NAvaluesTests -from .parse_dates import ParseDatesTests from .python_parser_only import PythonParserTests from .quoting import QuotingTests -from .skiprows import SkipRowsTests from .usecols import UsecolsTests -class BaseParser(MultithreadTests, NAvaluesTests, - ParseDatesTests, ParserTests, - SkipRowsTests, UsecolsTests, +class BaseParser(ParserTests, UsecolsTests, QuotingTests): def read_csv(self, *args, **kwargs): diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py new file mode 100644 index 0000000000000..1df2ca4fad4d8 --- /dev/null +++ b/pandas/tests/io/parser/test_skiprows.py @@ -0,0 +1,222 @@ +# -*- coding: utf-8 -*- + +""" +Tests that skipped rows are properly handled during +parsing for all of the parsers defined in parsers.py +""" + +from datetime import datetime + +import numpy as np +import pytest + +from pandas.compat import StringIO, lrange, range +from pandas.errors import EmptyDataError + +from pandas import DataFrame, Index +import pandas.util.testing as tm + + +@pytest.mark.parametrize("skiprows", [lrange(6), 6]) +def test_skip_rows_bug(all_parsers, skiprows): + # see gh-505 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +1/1/2000,1.,2.,3. +1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + result = parser.read_csv(StringIO(text), skiprows=skiprows, header=None, + index_col=0, parse_dates=True) + index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2), + datetime(2000, 1, 3)], name=0) + + expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), + columns=[1, 2, 3], index=index) + tm.assert_frame_equal(result, expected) + + +def test_deep_skip_rows(all_parsers): + # see gh-4382 + parser = all_parsers + data = "a,b,c\n" + "\n".join([",".join([str(i), str(i + 1), str(i + 2)]) + for i in range(10)]) + condensed_data = "a,b,c\n" + "\n".join([ + ",".join([str(i), str(i + 1), str(i + 2)]) + for i in [0, 1, 2, 3, 4, 6, 8, 9]]) + + result = parser.read_csv(StringIO(data), skiprows=[6, 8]) + condensed_result = parser.read_csv(StringIO(condensed_data)) + tm.assert_frame_equal(result, condensed_result) + + +def test_skip_rows_blank(all_parsers): + # see gh-9832 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c + +#foo,a,b,c +#foo,a,b,c + +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6
+1/3/2000,7,8,9
+"""
+    data = parser.read_csv(StringIO(text), skiprows=6, header=None,
+                           index_col=0, parse_dates=True)
+    index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2),
+                   datetime(2000, 1, 3)], name=0)
+
+    expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
+                         columns=[1, 2, 3],
+                         index=index)
+    tm.assert_frame_equal(data, expected)
+
+
+@pytest.mark.parametrize("data,kwargs,expected", [
+    ("""id,text,num_lines
+1,"line 11
+line 12",2
+2,"line 21
+line 22",2
+3,"line 31",1""",
+     dict(skiprows=[1]),
+     DataFrame([[2, "line 21\nline 22", 2],
+                [3, "line 31", 1]], columns=["id", "text", "num_lines"])),
+    ("a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
+     dict(quotechar="~", skiprows=[2]),
+     DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"])),
+    (("Text,url\n~example\n "
+      "sentence\n one~,url1\n~"
+      "example\n sentence\n two~,url2\n~"
+      "example\n sentence\n three~,url3"),
+     dict(quotechar="~", skiprows=[1, 3]),
+     DataFrame([['example\n sentence\n two', 'url2']],
+               columns=["Text", "url"]))
+])
+def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
+    # see gh-12775 and gh-10911
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_skip_row_with_quote(all_parsers):
+    # see gh-12775 and gh-10911
+    parser = all_parsers
+    data = """id,text,num_lines
+1,"line '11' line 12",2
+2,"line '21' line 22",2
+3,"line '31' line 32",1"""
+
+    exp_data = [[2, "line '21' line 22", 2],
+                [3, "line '31' line 32", 1]]
+    expected = DataFrame(exp_data, columns=[
+        "id", "text", "num_lines"])
+
+    result = parser.read_csv(StringIO(data), skiprows=[1])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("data,exp_data", [
+    ("""id,text,num_lines
+1,"line \n'11' line 12",2
+2,"line \n'21' line 22",2
+3,"line \n'31' line 32",1""",
+     [[2, "line \n'21' line 22", 2],
+      [3, "line \n'31' line 32", 1]]),
+    ("""id,text,num_lines
+1,"line '11\n' line 12",2
+2,"line '21\n' line 22",2
+3,"line '31\n' line 32",1""",
+     [[2, "line '21\n' line 22", 2],
+      [3, "line '31\n' line 32", 1]]),
+    ("""id,text,num_lines
+1,"line '11\n' \r\tline 12",2
+2,"line '21\n' \r\tline 22",2
+3,"line '31\n' \r\tline 32",1""",
+     [[2, "line '21\n' \r\tline 22", 2],
+      [3, "line '31\n' \r\tline 32", 1]]),
+])
+def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
+    # see gh-12775 and gh-10911
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), skiprows=[1])
+
+    expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("line_terminator", [
+    "\n",  # "LF"
+    "\r\n",  # "CRLF"
+    "\r"  # "CR"
+])
+def test_skiprows_lineterminator(all_parsers, line_terminator):
+    # see gh-9079
+    parser = all_parsers
+    data = "\n".join(["SMOSMANIA ThetaProbe-ML2X ",
+                      "2007/01/01 01:00 0.2140 U M ",
+                      "2007/01/01 02:00 0.2141 M O ",
+                      "2007/01/01 04:00 0.2142 D M "])
+    expected = DataFrame([["2007/01/01", "01:00", 0.2140, "U", "M"],
+                          ["2007/01/01", "02:00", 0.2141, "M", "O"],
+                          ["2007/01/01", "04:00", 0.2142, "D", "M"]],
+                         columns=["date", "time", "var", "flag",
+                                  "oflag"])
+
+    if parser.engine == "python" and line_terminator == "\r":
+        pytest.skip("'CR' not respected by the Python parser yet")
+
+    data = data.replace("\n", line_terminator)
+    result = parser.read_csv(StringIO(data), skiprows=1, delim_whitespace=True,
+                             names=["date", "time", "var", "flag", "oflag"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_skiprows_infield_quote(all_parsers): + # see gh-14459 + parser = all_parsers + data = "a\"\nb\"\na\n1" + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs,expected", [ + (dict(), DataFrame({"1": [3, 5]})), + (dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]})) +]) +def test_skip_rows_callable(all_parsers, kwargs, expected): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + result = parser.read_csv(StringIO(data), + skiprows=lambda x: x % 2 == 0, + **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_skip_rows_skip_all(all_parsers): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + msg = "No columns to parse from file" + + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: True) + + +def test_skip_rows_bad_callable(all_parsers): + msg = "by zero" + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + with pytest.raises(ZeroDivisionError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) diff --git a/pandas/tests/io/test_date_converters.py b/pandas/tests/io/test_date_converters.py new file mode 100644 index 0000000000000..c5a94883aa609 --- /dev/null +++ b/pandas/tests/io/test_date_converters.py @@ -0,0 +1,43 @@ +from datetime import datetime + +import numpy as np + +import pandas.util.testing as tm + +import pandas.io.date_converters as conv + + +def test_parse_date_time(): + dates = np.array(['2007/1/3', '2008/2/4'], dtype=object) + times = np.array(['05:07:09', '06:08:00'], dtype=object) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), + datetime(2008, 2, 4, 6, 8, 0)]) + + result = conv.parse_date_time(dates, times) + tm.assert_numpy_array_equal(result, expected) + + +def test_parse_date_fields(): + days = np.array([3, 4]) + months = np.array([1, 2]) + years = np.array([2007, 2008]) + result = conv.parse_date_fields(years, months, days) + + expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) + tm.assert_numpy_array_equal(result, expected) + + +def test_parse_all_fields(): + hours = np.array([5, 6]) + minutes = np.array([7, 8]) + seconds = np.array([9, 0]) + + days = np.array([3, 4]) + years = np.array([2007, 2008]) + months = np.array([1, 2]) + + result = conv.parse_all_fields(years, months, days, + hours, minutes, seconds) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), + datetime(2008, 2, 4, 6, 8, 0)]) + tm.assert_numpy_array_equal(result, expected)