From 2448f48e38b8c76bbad8c902333107c75cf06791 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:15:00 +0000 Subject: [PATCH 01/64] BUG: Changes types used in packing structs Types used for integer conversion where always half the size they should be. Produced a bug when exporting data tables with long integer data (np.int64). --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 55bcbd76c2248..cd5c443b3d6a4 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -258,7 +258,7 @@ def __init__(self, encoding): (65530, np.int16) ] ) - self.TYPE_MAP = lrange(251) + list('bhlfd') + self.TYPE_MAP = lrange(251) + list('hlqfd') self.TYPE_MAP_XML = \ dict( [ From eeded1f0dfc357fe45da6039deb8172af082b986 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:37:21 +0000 Subject: [PATCH 02/64] Added test for integer conversion bug Added test for incorrect integer conversion from int16, int32 and int64 --- pandas/io/tests/test_stata.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1640bee7a9929..cd46bea1d45ed 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -272,6 +272,20 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta13(self): + s1 = Series(2**9,dtype=np.int16) + s2 = Series(2**17,dtype=np.int32) + s3 = Series(2**33,dtype=np.int64) + original = DataFrame({'short':s1,'int':s2,'long':s3}) + original.index.name = 'index' + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + original) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 
fba45720bd8bf63a43d6dfacb44f51c2ee81b255 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:27:55 +0000 Subject: [PATCH 03/64] Removed unintended whitespace --- pandas/io/tests/test_stata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index cd46bea1d45ed..fcc6311a08c0b 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -280,12 +280,12 @@ def test_read_write_dta13(self): original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - with tm.ensure_clean() as path: + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) - + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 50f579f6aff5a94e5181698314a38f0a0a05fdb6 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:35:52 +0000 Subject: [PATCH 04/64] Fixed another typo --- pandas/io/tests/test_stata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index fcc6311a08c0b..db268d20af048 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -273,13 +273,13 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - def test_read_write_dta13(self): + def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = Series(2**33,dtype=np.int64) original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) @@ -287,6 +287,7 @@ def test_read_write_dta13(self): original) + if __name__ == '__main__': 
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 77939e0f5e54b410eddcd3eb29f46044dd42a816 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:15:00 +0000 Subject: [PATCH 05/64] BUG: Changes types used in packing structs Types used for integer conversion where always half the size they should be. Produced a bug when exporting data tables with long integer data (np.int64). --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 55bcbd76c2248..cd5c443b3d6a4 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -258,7 +258,7 @@ def __init__(self, encoding): (65530, np.int16) ] ) - self.TYPE_MAP = lrange(251) + list('bhlfd') + self.TYPE_MAP = lrange(251) + list('hlqfd') self.TYPE_MAP_XML = \ dict( [ From fa7faff15c2542c1d8ed13f4fad1308e945dfd8b Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:37:21 +0000 Subject: [PATCH 06/64] Added test for integer conversion bug Added test for incorrect integer conversion from int16, int32 and int64 --- pandas/io/tests/test_stata.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1640bee7a9929..cd46bea1d45ed 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -272,6 +272,20 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta13(self): + s1 = Series(2**9,dtype=np.int16) + s2 = Series(2**17,dtype=np.int32) + s3 = Series(2**33,dtype=np.int64) + original = DataFrame({'short':s1,'int':s2,'long':s3}) + original.index.name = 'index' + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + original) + if __name__ == '__main__': 
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 26b5cdcf871c4d7c49a65a659f954507185c697f Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:27:55 +0000 Subject: [PATCH 07/64] Removed unintended whitespace --- pandas/io/tests/test_stata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index cd46bea1d45ed..fcc6311a08c0b 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -280,12 +280,12 @@ def test_read_write_dta13(self): original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - with tm.ensure_clean() as path: + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) - + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From a572356ece8a9d4d0d964975650f81658de13242 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:35:52 +0000 Subject: [PATCH 08/64] Fixed another typo --- pandas/io/tests/test_stata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index fcc6311a08c0b..db268d20af048 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -273,13 +273,13 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - def test_read_write_dta13(self): + def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = Series(2**33,dtype=np.int64) original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) @@ -287,6 +287,7 @@ def 
test_read_write_dta13(self): original) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 467a84f6d2561a67f957629c7e982476ccf9d33f Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 13:52:03 +0000 Subject: [PATCH 09/64] FIX: Corrected incorrect data type conversion between pandas and Stata FIX: Remove unnecessary, potentially precision degrading cast to Series when writing data ENH: Added function to cast columns from NumPy data types to Stata data types FIX: Corrected tests for correct Stata datatypes --- pandas/io/stata.py | 58 +++++++++++++++++++++++++---------- pandas/io/tests/test_stata.py | 11 +++++-- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index cd5c443b3d6a4..720461f23def3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -175,6 +175,26 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +def _cast_to_stata_types(data): + for col in data: + dtype = data[col].dtype + if dtype==np.int8: + if data[col].max() > 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype==np.int16: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype==np.int64: + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() <= 2*53 or data[col].min() >= -2**53: + from warnings import warn + warn("int64 data out of range for float64, data possibly truncated.") + + return data + class StataMissingValue(StringMixin): """ An observation's missing value. 
@@ -240,9 +260,9 @@ def __init__(self, encoding): dict( lzip(range(1, 245), ['a' + str(i) for i in range(1, 245)]) + [ - (251, np.int16), - (252, np.int32), - (253, np.int64), + (251, np.int8), + (252, np.int16), + (253, np.int32), (254, np.float32), (255, np.float64) ] @@ -253,12 +273,12 @@ def __init__(self, encoding): (32768, np.string_), (65526, np.float64), (65527, np.float32), - (65528, np.int64), - (65529, np.int32), - (65530, np.int16) + (65528, np.int32), + (65529, np.int16), + (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('hlqfd') + self.TYPE_MAP = lrange(251) + list('bhlfd') self.TYPE_MAP_XML = \ dict( [ @@ -855,11 +875,12 @@ def _dtype_to_stata_type(dtype): See TYPE_MAP and comments for an explanation. This is also explained in the dta spec. 1 - 244 are strings of this length - 251 - chr(251) - for int8 and int16, byte - 252 - chr(252) - for int32, int - 253 - chr(253) - for int64, long - 254 - chr(254) - for float32, float - 255 - chr(255) - double, double + Pandas Stata + 251 - chr(251) - for int8 byte + 252 - chr(252) - for int16 int + 253 - chr(253) - for int32 long + 254 - chr(254) - for float32 float + 255 - chr(255) - for double double If there are dates to convert, then dtype will already have the correct type inserted. @@ -878,8 +899,10 @@ def _dtype_to_stata_type(dtype): elif dtype == np.int64: return chr(253) elif dtype == np.int32: + return chr(253) + elif dtype == np.int16: return chr(252) - elif dtype == np.int8 or dtype == np.int16: + elif dtype == np.int8: return chr(251) else: # pragma : no cover raise ValueError("Data type %s not currently understood. 
" @@ -970,7 +993,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._file = _open_file_binary_write( fname, self._encoding or self._default_encoding ) - self.type_converters = {253: np.long, 252: int} + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): """ @@ -990,11 +1013,14 @@ def __init__(self, data): self.data = data def __iter__(self): - for i, row in data.iterrows(): - yield row + for row in data.itertuples(): + # First element is index, so remove + yield row[1:] if self._write_index: data = data.reset_index() + # Check columns for compatbaility with stata + data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape self.data = data diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index db268d20af048..c5a47f0b55621 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -128,8 +128,8 @@ def test_read_dta3(self): # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int32) - expected['quarter'] = expected['quarter'].astype(np.int16) + expected['year'] = expected['year'].astype(np.int16) + expected['quarter'] = expected['quarter'].astype(np.int8) tm.assert_frame_equal(parsed, expected) tm.assert_frame_equal(parsed_13, expected) @@ -175,6 +175,9 @@ def test_write_dta6(self): original = self.read_csv(self.csv3) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['year'] = original['year'].astype(np.int32) + original['quarter'] = original['quarter'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None, False) @@ -209,6 +212,8 @@ def test_read_write_dta10(self): 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['integer'] = 
original['integer'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, {'datetime': 'tc'}, False) @@ -245,6 +250,7 @@ def test_read_write_dta11(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -263,6 +269,7 @@ def test_read_write_dta12(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: From 174aca3c20622efb61e3650b367a736dd52bb533 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 13:58:22 +0000 Subject: [PATCH 10/64] Removed unintended branch merge --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b9f2e7f5fc402..720461f23def3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -278,7 +278,7 @@ def __init__(self, encoding): (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('hlqfd') + self.TYPE_MAP = lrange(251) + list('bhlfd') self.TYPE_MAP_XML = \ dict( [ From 89fb3c0d7fdc6e200e6be3fdae9735315c1429d4 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 14:34:21 +0000 Subject: [PATCH 11/64] Fixed formatting in comparison after casting --- pandas/io/tests/test_stata.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index c5a47f0b55621..76783147b9c32 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -284,14 +284,17 @@ def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = 
Series(2**33,dtype=np.int64) - original = DataFrame({'short':s1,'int':s2,'long':s3}) + original = DataFrame({'int16':s1,'int32':s2,'int64':s3}) original.index.name = 'index' + formatted = original + formatted['int64'] = formatted['int64'].astype(np.float64) + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), - original) + formatted) From 630877630dca4cd8722d73d0b0696addc1ffe39b Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 26 Feb 2014 19:01:34 +0000 Subject: [PATCH 12/64] Added docstring for new function and warning class --- pandas/io/stata.py | 48 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 720461f23def3..2882a7c42ba1c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -175,26 +175,62 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +class PossiblePrecisionLoss(Warning): + pass + + +precision_loss_doc = """ +Column converted from %s to %s, and some data are outside of the lossless +conversion range. This may result in a loss of precision in the saved data. +""" + + def _cast_to_stata_types(data): + """Checks the dtypes of the columns of a pandas DataFrame for + compatibility with the data types and ranges supported by Stata, and + converts if necessary. + + Parameters + ---------- + data : DataFrame + The DataFrame to check and convert + + Notes + ----- + Numeric columns must be one of int8, int16, int32, float32 or float64, with + some additional value restrictions on the integer data types. int8 and + int16 columns are checked for violations of the value restrictions and + upcast if needed. int64 data is not usable in Stata, and so it is + downcast to int32 whenever the value are in the int32 range, and + sidecast to float64 when larger than this range. 
If the int64 values + are outside of the range of those perfectly representable as float64 values, + a warning is raised. + """ + ws = '' for col in data: dtype = data[col].dtype - if dtype==np.int8: + if dtype == np.int8: if data[col].max() > 100 or data[col].min() < -127: data[col] = data[col].astype(np.int16) - elif dtype==np.int16: + elif dtype == np.int16: if data[col].max() > 32740 or data[col].min() < -32767: data[col] = data[col].astype(np.int32) - elif dtype==np.int64: + elif dtype == np.int64: if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) - if data[col].max() <= 2*53 or data[col].min() >= -2**53: - from warnings import warn - warn("int64 data out of range for float64, data possibly truncated.") + if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53: + ws = precision_loss_doc % ('int64', 'float64') + + if ws: + import warnings + + warnings.warn(ws, PossiblePrecisionLoss) return data + class StataMissingValue(StringMixin): """ An observation's missing value. From 4e65c2529d1517d621a944e04c16474f785e491a Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:15:00 +0000 Subject: [PATCH 13/64] BUG: Changes types used in packing structs Types used for integer conversion where always half the size they should be. Produced a bug when exporting data tables with long integer data (np.int64). 
--- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 55bcbd76c2248..cd5c443b3d6a4 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -258,7 +258,7 @@ def __init__(self, encoding): (65530, np.int16) ] ) - self.TYPE_MAP = lrange(251) + list('bhlfd') + self.TYPE_MAP = lrange(251) + list('hlqfd') self.TYPE_MAP_XML = \ dict( [ From 6b996430fdb2343c15303a466f20fd5475359c5c Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:37:21 +0000 Subject: [PATCH 14/64] Added test for integer conversion bug Added test for incorrect integer conversion from int16, int32 and int64 --- pandas/io/tests/test_stata.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1640bee7a9929..cd46bea1d45ed 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -272,6 +272,20 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta13(self): + s1 = Series(2**9,dtype=np.int16) + s2 = Series(2**17,dtype=np.int32) + s3 = Series(2**33,dtype=np.int64) + original = DataFrame({'short':s1,'int':s2,'long':s3}) + original.index.name = 'index' + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + original) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From bfed97b271d1d438934ff53f65dcc125edf84644 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:27:55 +0000 Subject: [PATCH 15/64] Removed unintended whitespace --- pandas/io/tests/test_stata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 
cd46bea1d45ed..fcc6311a08c0b 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -280,12 +280,12 @@ def test_read_write_dta13(self): original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - with tm.ensure_clean() as path: + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) - + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 144516a313b18b44766e61dcf5c799bf97c6ea65 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:35:52 +0000 Subject: [PATCH 16/64] Fixed another typo --- pandas/io/tests/test_stata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index fcc6311a08c0b..db268d20af048 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -273,13 +273,13 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - def test_read_write_dta13(self): + def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = Series(2**33,dtype=np.int64) original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) @@ -287,6 +287,7 @@ def test_read_write_dta13(self): original) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From f4eb1387aed46bd0b3790162e5f012cebd8c33dd Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 13:52:03 +0000 Subject: [PATCH 17/64] FIX: Corrected incorrect data type conversion between pandas and Stata FIX: Remove unnecessary, potentially precision 
degrading cast to Series when writing data ENH: Added function to cast columns from NumPy data types to Stata data types FIX: Corrected tests for correct Stata datatypes --- pandas/io/stata.py | 58 +++++++++++++++++++++++++---------- pandas/io/tests/test_stata.py | 11 +++++-- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index cd5c443b3d6a4..720461f23def3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -175,6 +175,26 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +def _cast_to_stata_types(data): + for col in data: + dtype = data[col].dtype + if dtype==np.int8: + if data[col].max() > 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype==np.int16: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype==np.int64: + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() <= 2*53 or data[col].min() >= -2**53: + from warnings import warn + warn("int64 data out of range for float64, data possibly truncated.") + + return data + class StataMissingValue(StringMixin): """ An observation's missing value. 
@@ -240,9 +260,9 @@ def __init__(self, encoding): dict( lzip(range(1, 245), ['a' + str(i) for i in range(1, 245)]) + [ - (251, np.int16), - (252, np.int32), - (253, np.int64), + (251, np.int8), + (252, np.int16), + (253, np.int32), (254, np.float32), (255, np.float64) ] @@ -253,12 +273,12 @@ def __init__(self, encoding): (32768, np.string_), (65526, np.float64), (65527, np.float32), - (65528, np.int64), - (65529, np.int32), - (65530, np.int16) + (65528, np.int32), + (65529, np.int16), + (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('hlqfd') + self.TYPE_MAP = lrange(251) + list('bhlfd') self.TYPE_MAP_XML = \ dict( [ @@ -855,11 +875,12 @@ def _dtype_to_stata_type(dtype): See TYPE_MAP and comments for an explanation. This is also explained in the dta spec. 1 - 244 are strings of this length - 251 - chr(251) - for int8 and int16, byte - 252 - chr(252) - for int32, int - 253 - chr(253) - for int64, long - 254 - chr(254) - for float32, float - 255 - chr(255) - double, double + Pandas Stata + 251 - chr(251) - for int8 byte + 252 - chr(252) - for int16 int + 253 - chr(253) - for int32 long + 254 - chr(254) - for float32 float + 255 - chr(255) - for double double If there are dates to convert, then dtype will already have the correct type inserted. @@ -878,8 +899,10 @@ def _dtype_to_stata_type(dtype): elif dtype == np.int64: return chr(253) elif dtype == np.int32: + return chr(253) + elif dtype == np.int16: return chr(252) - elif dtype == np.int8 or dtype == np.int16: + elif dtype == np.int8: return chr(251) else: # pragma : no cover raise ValueError("Data type %s not currently understood. 
" @@ -970,7 +993,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._file = _open_file_binary_write( fname, self._encoding or self._default_encoding ) - self.type_converters = {253: np.long, 252: int} + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): """ @@ -990,11 +1013,14 @@ def __init__(self, data): self.data = data def __iter__(self): - for i, row in data.iterrows(): - yield row + for row in data.itertuples(): + # First element is index, so remove + yield row[1:] if self._write_index: data = data.reset_index() + # Check columns for compatbaility with stata + data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape self.data = data diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index db268d20af048..c5a47f0b55621 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -128,8 +128,8 @@ def test_read_dta3(self): # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int32) - expected['quarter'] = expected['quarter'].astype(np.int16) + expected['year'] = expected['year'].astype(np.int16) + expected['quarter'] = expected['quarter'].astype(np.int8) tm.assert_frame_equal(parsed, expected) tm.assert_frame_equal(parsed_13, expected) @@ -175,6 +175,9 @@ def test_write_dta6(self): original = self.read_csv(self.csv3) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['year'] = original['year'].astype(np.int32) + original['quarter'] = original['quarter'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None, False) @@ -209,6 +212,8 @@ def test_read_write_dta10(self): 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['integer'] = 
original['integer'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, {'datetime': 'tc'}, False) @@ -245,6 +250,7 @@ def test_read_write_dta11(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -263,6 +269,7 @@ def test_read_write_dta12(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: From faae4a0e2f4a949ca9f989b21525fc623ba0c118 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:15:00 +0000 Subject: [PATCH 18/64] BUG: Changes types used in packing structs Types used for integer conversion where always half the size they should be. Produced a bug when exporting data tables with long integer data (np.int64). 
--- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 720461f23def3..b9f2e7f5fc402 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -278,7 +278,7 @@ def __init__(self, encoding): (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('bhlfd') + self.TYPE_MAP = lrange(251) + list('hlqfd') self.TYPE_MAP_XML = \ dict( [ From de11ef9e1d7d8b7f6d8fa1c9af1dd87c5748aaf1 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:37:21 +0000 Subject: [PATCH 19/64] Added test for integer conversion bug Added test for incorrect integer conversion from int16, int32 and int64 --- pandas/io/tests/test_stata.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index c5a47f0b55621..cd46bea1d45ed 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -128,8 +128,8 @@ def test_read_dta3(self): # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int16) - expected['quarter'] = expected['quarter'].astype(np.int8) + expected['year'] = expected['year'].astype(np.int32) + expected['quarter'] = expected['quarter'].astype(np.int16) tm.assert_frame_equal(parsed, expected) tm.assert_frame_equal(parsed_13, expected) @@ -175,9 +175,6 @@ def test_write_dta6(self): original = self.read_csv(self.csv3) original.index.name = 'index' - original.index = original.index.astype(np.int32) - original['year'] = original['year'].astype(np.int32) - original['quarter'] = original['quarter'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None, False) @@ -212,8 +209,6 @@ def test_read_write_dta10(self): 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' - original.index = original.index.astype(np.int32) - 
original['integer'] = original['integer'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, {'datetime': 'tc'}, False) @@ -250,7 +245,6 @@ def test_read_write_dta11(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) formatted.index.name = 'index' - formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -269,7 +263,6 @@ def test_read_write_dta12(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) formatted.index.name = 'index' - formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -280,20 +273,19 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - def test_read_write_dta13(self): + def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = Series(2**33,dtype=np.int64) original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - - with tm.ensure_clean() as path: + + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) - - + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 4e0df964dc20f192c16a59a5946747ab5c41203b Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:27:55 +0000 Subject: [PATCH 20/64] Removed unintended whitespace --- pandas/io/tests/test_stata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index cd46bea1d45ed..fcc6311a08c0b 100644 --- a/pandas/io/tests/test_stata.py +++ 
b/pandas/io/tests/test_stata.py @@ -280,12 +280,12 @@ def test_read_write_dta13(self): original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - with tm.ensure_clean() as path: + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) - + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From c4cff55447592edc4364262e036c4cc40027c938 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:35:52 +0000 Subject: [PATCH 21/64] Fixed another typo --- pandas/io/tests/test_stata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index fcc6311a08c0b..db268d20af048 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -273,13 +273,13 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - def test_read_write_dta13(self): + def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = Series(2**33,dtype=np.int64) original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) @@ -287,6 +287,7 @@ def test_read_write_dta13(self): original) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From afee2dc073afe1820d9f89f9dc05d91bb745af50 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 13:58:22 +0000 Subject: [PATCH 22/64] Removed unintended branch merge --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b9f2e7f5fc402..720461f23def3 
100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -278,7 +278,7 @@ def __init__(self, encoding): (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('hlqfd') + self.TYPE_MAP = lrange(251) + list('bhlfd') self.TYPE_MAP_XML = \ dict( [ From 4a96faaa23d8278e1ba5a75315d550d466261a2a Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 14:34:21 +0000 Subject: [PATCH 23/64] Fixed formatting in comparison after casting --- pandas/io/tests/test_stata.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index db268d20af048..955bd12ac47a6 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -277,14 +277,17 @@ def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = Series(2**33,dtype=np.int64) - original = DataFrame({'short':s1,'int':s2,'long':s3}) + original = DataFrame({'int16':s1,'int32':s2,'int64':s3}) original.index.name = 'index' + formatted = original + formatted['int64'] = formatted['int64'].astype(np.float64) + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), - original) + formatted) From 238bb939eb2a475ee5c94352f887045db60515d3 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 26 Feb 2014 19:01:34 +0000 Subject: [PATCH 24/64] Added docstring for new function and warning class --- pandas/io/stata.py | 48 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 720461f23def3..2882a7c42ba1c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -175,26 +175,62 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +class PossiblePrecisionLoss(Warning): + pass + + +precision_loss_doc = """ +Column converted from %s to %s, 
and some data are outside of the lossless +conversion range. This may result in a loss of precision in the saved data. +""" + + def _cast_to_stata_types(data): + """Checks the dtypes of the columns of a pandas DataFrame for + compatibility with the data types and ranges supported by Stata, and + converts if necessary. + + Parameters + ---------- + data : DataFrame + The DataFrame to check and convert + + Notes + ----- + Numeric columns must be one of int8, int16, int32, float32 or float64, with + some additional value restrictions on the integer data types. int8 and + int16 columns are checked for violations of the value restrictions and + upcast if needed. int64 data is not usable in Stata, and so it is + downcast to int32 whenever the value are in the int32 range, and + sidecast to float64 when larger than this range. If the int64 values + are outside of the range of those perfectly representable as float64 values, + a warning is raised. + """ + ws = '' for col in data: dtype = data[col].dtype - if dtype==np.int8: + if dtype == np.int8: if data[col].max() > 100 or data[col].min() < -127: data[col] = data[col].astype(np.int16) - elif dtype==np.int16: + elif dtype == np.int16: if data[col].max() > 32740 or data[col].min() < -32767: data[col] = data[col].astype(np.int32) - elif dtype==np.int64: + elif dtype == np.int64: if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) - if data[col].max() <= 2*53 or data[col].min() >= -2**53: - from warnings import warn - warn("int64 data out of range for float64, data possibly truncated.") + if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53: + ws = precision_loss_doc % ('int64', 'float64') + + if ws: + import warnings + + warnings.warn(ws, PossiblePrecisionLoss) return data + class StataMissingValue(StringMixin): """ An observation's missing value. 
From 13f56eec79deecca5a9dcb2336c129148017d789 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:15:00 +0000 Subject: [PATCH 25/64] BUG: Changes types used in packing structs Types used for integer conversion were always half the size they should be. Produced a bug when exporting data tables with long integer data (np.int64). --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 55bcbd76c2248..cd5c443b3d6a4 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -258,7 +258,7 @@ def __init__(self, encoding): (65530, np.int16) ] ) - self.TYPE_MAP = lrange(251) + list('bhlfd') + self.TYPE_MAP = lrange(251) + list('hlqfd') self.TYPE_MAP_XML = \ dict( [ From d30b445e1f7c144852bf0767eadb67ec5dffe6bf Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:37:21 +0000 Subject: [PATCH 26/64] Added test for integer conversion bug Added test for incorrect integer conversion from int16, int32 and int64 --- pandas/io/tests/test_stata.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1640bee7a9929..cd46bea1d45ed 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -272,6 +272,20 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta13(self): + s1 = Series(2**9,dtype=np.int16) + s2 = Series(2**17,dtype=np.int32) + s3 = Series(2**33,dtype=np.int64) + original = DataFrame({'short':s1,'int':s2,'long':s3}) + original.index.name = 'index' + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + original) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 
58bc8cea0a5d5fb17ccd1293c1f4a008c9762501 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:27:55 +0000 Subject: [PATCH 27/64] Removed unintended whitespace --- pandas/io/tests/test_stata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index cd46bea1d45ed..fcc6311a08c0b 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -280,12 +280,12 @@ def test_read_write_dta13(self): original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - with tm.ensure_clean() as path: + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) - + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From c925923d046ec7c2f71d1e349407883060622b08 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:35:52 +0000 Subject: [PATCH 28/64] Fixed another typo --- pandas/io/tests/test_stata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index fcc6311a08c0b..db268d20af048 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -273,13 +273,13 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - def test_read_write_dta13(self): + def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = Series(2**33,dtype=np.int64) original = DataFrame({'short':s1,'int':s2,'long':s3}) original.index.name = 'index' - + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) @@ -287,6 +287,7 @@ def test_read_write_dta13(self): original) + if __name__ == '__main__': 
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 7fb4d1b0b3285c86529ec180f248b263242df049 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:37:21 +0000 Subject: [PATCH 29/64] Added test for integer conversion bug Added test for incorrect integer conversion from int16, int32 and int64 --- pandas/io/tests/test_stata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index db268d20af048..a74a3a210dfba 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -287,7 +287,6 @@ def test_read_write_dta13(self): original) - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 9163fe88f8cbaefb56a1aae2b4487e2fc9fd9de1 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 23:35:52 +0000 Subject: [PATCH 30/64] Fixed another typo --- pandas/io/tests/test_stata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index a74a3a210dfba..db268d20af048 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -287,6 +287,7 @@ def test_read_write_dta13(self): original) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From f329ed0f8e8393b40fbc00c63ee3df0f887363f6 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 13:52:03 +0000 Subject: [PATCH 31/64] FIX: Corrected incorrect data type conversion between pandas and Stata FIX: Remove unnecessary, potentially precision degrading cast to Series when writing data ENH: Added function to cast columns from NumPy data types to Stata data types FIX: Corrected tests for correct Stata datatypes --- pandas/io/stata.py | 58 +++++++++++++++++++++++++---------- pandas/io/tests/test_stata.py | 11 +++++-- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git 
a/pandas/io/stata.py b/pandas/io/stata.py index cd5c443b3d6a4..720461f23def3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -175,6 +175,26 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +def _cast_to_stata_types(data): + for col in data: + dtype = data[col].dtype + if dtype==np.int8: + if data[col].max() > 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype==np.int16: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype==np.int64: + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() <= 2*53 or data[col].min() >= -2**53: + from warnings import warn + warn("int64 data out of range for float64, data possibly truncated.") + + return data + class StataMissingValue(StringMixin): """ An observation's missing value. @@ -240,9 +260,9 @@ def __init__(self, encoding): dict( lzip(range(1, 245), ['a' + str(i) for i in range(1, 245)]) + [ - (251, np.int16), - (252, np.int32), - (253, np.int64), + (251, np.int8), + (252, np.int16), + (253, np.int32), (254, np.float32), (255, np.float64) ] @@ -253,12 +273,12 @@ def __init__(self, encoding): (32768, np.string_), (65526, np.float64), (65527, np.float32), - (65528, np.int64), - (65529, np.int32), - (65530, np.int16) + (65528, np.int32), + (65529, np.int16), + (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('hlqfd') + self.TYPE_MAP = lrange(251) + list('bhlfd') self.TYPE_MAP_XML = \ dict( [ @@ -855,11 +875,12 @@ def _dtype_to_stata_type(dtype): See TYPE_MAP and comments for an explanation. This is also explained in the dta spec. 
1 - 244 are strings of this length - 251 - chr(251) - for int8 and int16, byte - 252 - chr(252) - for int32, int - 253 - chr(253) - for int64, long - 254 - chr(254) - for float32, float - 255 - chr(255) - double, double + Pandas Stata + 251 - chr(251) - for int8 byte + 252 - chr(252) - for int16 int + 253 - chr(253) - for int32 long + 254 - chr(254) - for float32 float + 255 - chr(255) - for double double If there are dates to convert, then dtype will already have the correct type inserted. @@ -878,8 +899,10 @@ def _dtype_to_stata_type(dtype): elif dtype == np.int64: return chr(253) elif dtype == np.int32: + return chr(253) + elif dtype == np.int16: return chr(252) - elif dtype == np.int8 or dtype == np.int16: + elif dtype == np.int8: return chr(251) else: # pragma : no cover raise ValueError("Data type %s not currently understood. " @@ -970,7 +993,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._file = _open_file_binary_write( fname, self._encoding or self._default_encoding ) - self.type_converters = {253: np.long, 252: int} + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): """ @@ -990,11 +1013,14 @@ def __init__(self, data): self.data = data def __iter__(self): - for i, row in data.iterrows(): - yield row + for row in data.itertuples(): + # First element is index, so remove + yield row[1:] if self._write_index: data = data.reset_index() + # Check columns for compatbaility with stata + data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape self.data = data diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index db268d20af048..c5a47f0b55621 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -128,8 +128,8 @@ def test_read_dta3(self): # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int32) - 
expected['quarter'] = expected['quarter'].astype(np.int16) + expected['year'] = expected['year'].astype(np.int16) + expected['quarter'] = expected['quarter'].astype(np.int8) tm.assert_frame_equal(parsed, expected) tm.assert_frame_equal(parsed_13, expected) @@ -175,6 +175,9 @@ def test_write_dta6(self): original = self.read_csv(self.csv3) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['year'] = original['year'].astype(np.int32) + original['quarter'] = original['quarter'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None, False) @@ -209,6 +212,8 @@ def test_read_write_dta10(self): 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['integer'] = original['integer'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, {'datetime': 'tc'}, False) @@ -245,6 +250,7 @@ def test_read_write_dta11(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -263,6 +269,7 @@ def test_read_write_dta12(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: From 9e05c8697fdea82518581a9c5bc202ae6f96d71d Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 14:34:21 +0000 Subject: [PATCH 32/64] Fixed formatting in comparison after casting --- pandas/io/tests/test_stata.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 
c5a47f0b55621..76783147b9c32 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -284,14 +284,17 @@ def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = Series(2**33,dtype=np.int64) - original = DataFrame({'short':s1,'int':s2,'long':s3}) + original = DataFrame({'int16':s1,'int32':s2,'int64':s3}) original.index.name = 'index' + formatted = original + formatted['int64'] = formatted['int64'].astype(np.float64) + with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), - original) + formatted) From 4d21b71c8bf7af289a75beefd47409612aa04de2 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 26 Feb 2014 19:01:34 +0000 Subject: [PATCH 33/64] Added docstring for new function and warning class --- pandas/io/stata.py | 48 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 720461f23def3..2882a7c42ba1c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -175,26 +175,62 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +class PossiblePrecisionLoss(Warning): + pass + + +precision_loss_doc = """ +Column converted from %s to %s, and some data are outside of the lossless +conversion range. This may result in a loss of precision in the saved data. +""" + + def _cast_to_stata_types(data): + """Checks the dtypes of the columns of a pandas DataFrame for + compatibility with the data types and ranges supported by Stata, and + converts if necessary. + + Parameters + ---------- + data : DataFrame + The DataFrame to check and convert + + Notes + ----- + Numeric columns must be one of int8, int16, int32, float32 or float64, with + some additional value restrictions on the integer data types. 
int8 and + int16 columns are checked for violations of the value restrictions and + upcast if needed. int64 data is not usable in Stata, and so it is + downcast to int32 whenever the value are in the int32 range, and + sidecast to float64 when larger than this range. If the int64 values + are outside of the range of those perfectly representable as float64 values, + a warning is raised. + """ + ws = '' for col in data: dtype = data[col].dtype - if dtype==np.int8: + if dtype == np.int8: if data[col].max() > 100 or data[col].min() < -127: data[col] = data[col].astype(np.int16) - elif dtype==np.int16: + elif dtype == np.int16: if data[col].max() > 32740 or data[col].min() < -32767: data[col] = data[col].astype(np.int32) - elif dtype==np.int64: + elif dtype == np.int64: if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) - if data[col].max() <= 2*53 or data[col].min() >= -2**53: - from warnings import warn - warn("int64 data out of range for float64, data possibly truncated.") + if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53: + ws = precision_loss_doc % ('int64', 'float64') + + if ws: + import warnings + + warnings.warn(ws, PossiblePrecisionLoss) return data + class StataMissingValue(StringMixin): """ An observation's missing value. From 07b18851c7587d5bbb647cd84b99fbc9bc8a2a2c Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:15:00 +0000 Subject: [PATCH 34/64] BUG: Changes types used in packing structs Types used for integer conversion were always half the size they should be. Produced a bug when exporting data tables with long integer data (np.int64). 
--- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2882a7c42ba1c..fce2847563406 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -314,7 +314,7 @@ def __init__(self, encoding): (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('bhlfd') + self.TYPE_MAP = lrange(251) + list('hlqfd') self.TYPE_MAP_XML = \ dict( [ From a0a0cad2b763be9eb7b03a304d7bc388f10182b3 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:37:21 +0000 Subject: [PATCH 35/64] Added test for integer conversion bug Added test for incorrect integer conversion from int16, int32 and int64 --- pandas/io/tests/test_stata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 76783147b9c32..62004d2c16bce 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -297,7 +297,6 @@ def test_read_write_dta13(self): formatted) - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From f7aaa9e4e70284a91a2a4407bbfac84bf0891538 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 13:52:03 +0000 Subject: [PATCH 36/64] FIX: Corrected incorrect data type conversion between pandas and Stata FIX: Remove unnecessary, potentially precision degrading cast to Series when writing data ENH: Added function to cast columns from NumPy data types to Stata data types FIX: Corrected tests for correct Stata datatypes --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index fce2847563406..2882a7c42ba1c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -314,7 +314,7 @@ def __init__(self, encoding): (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('hlqfd') + self.TYPE_MAP = lrange(251) + list('bhlfd') self.TYPE_MAP_XML = \ dict( [ From 
16611586268ee6d44ed60e0fe823ef785e06fbc1 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:15:00 +0000 Subject: [PATCH 37/64] BUG: Changes types used in packing structs Types used for integer conversion were always half the size they should be. Produced a bug when exporting data tables with long integer data (np.int64). --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2882a7c42ba1c..fce2847563406 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -314,7 +314,7 @@ def __init__(self, encoding): (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('bhlfd') + self.TYPE_MAP = lrange(251) + list('hlqfd') self.TYPE_MAP_XML = \ dict( [ From 00232bc165defdac1f280dbb2a6880f88dccbf3c Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:37:21 +0000 Subject: [PATCH 38/64] Added test for integer conversion bug Added test for incorrect integer conversion from int16, int32 and int64 --- pandas/io/tests/test_stata.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 62004d2c16bce..06d64cb1841dc 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -128,8 +128,8 @@ def test_read_dta3(self): # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int16) - expected['quarter'] = expected['quarter'].astype(np.int8) + expected['year'] = expected['year'].astype(np.int32) + expected['quarter'] = expected['quarter'].astype(np.int16) tm.assert_frame_equal(parsed, expected) tm.assert_frame_equal(parsed_13, expected) @@ -175,9 +175,6 @@ def test_write_dta6(self): original = self.read_csv(self.csv3) original.index.name = 'index' - original.index = original.index.astype(np.int32) - original['year'] = original['year'].astype(np.int32) - 
original['quarter'] = original['quarter'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None, False) @@ -212,8 +209,6 @@ def test_read_write_dta10(self): 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' - original.index = original.index.astype(np.int32) - original['integer'] = original['integer'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, {'datetime': 'tc'}, False) @@ -250,7 +245,6 @@ def test_read_write_dta11(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) formatted.index.name = 'index' - formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -269,7 +263,6 @@ def test_read_write_dta12(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) formatted.index.name = 'index' - formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -281,10 +274,10 @@ def test_read_write_dta12(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) def test_read_write_dta13(self): - s1 = Series(2**9,dtype=np.int16) - s2 = Series(2**17,dtype=np.int32) - s3 = Series(2**33,dtype=np.int64) - original = DataFrame({'int16':s1,'int32':s2,'int64':s3}) + s1 = Series(2**9, dtype=np.int16) + s2 = Series(2**17, dtype=np.int32) + s3 = Series(2**33, dtype=np.int64) + original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3}) original.index.name = 'index' formatted = original @@ -296,7 +289,6 @@ def test_read_write_dta13(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From f8de1996e4e25fcacbbbb739f0c1d9bb56d7cdfd Mon Sep 17 00:00:00 
2001 From: bashtage Date: Wed, 12 Feb 2014 23:35:52 +0000 Subject: [PATCH 39/64] Fixed another typo --- pandas/io/tests/test_stata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 06d64cb1841dc..37961357f21bb 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -289,6 +289,7 @@ def test_read_write_dta13(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 02e44721611a75741840b60db561a0fb37eec95e Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 13:58:22 +0000 Subject: [PATCH 40/64] Removed unintended branch merge --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index fce2847563406..2882a7c42ba1c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -314,7 +314,7 @@ def __init__(self, encoding): (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('hlqfd') + self.TYPE_MAP = lrange(251) + list('bhlfd') self.TYPE_MAP_XML = \ dict( [ From 9788ad19db036ba8eded8b66b16b45ce392d24f7 Mon Sep 17 00:00:00 2001 From: immerrr Date: Sat, 22 Feb 2014 11:58:28 +0400 Subject: [PATCH 41/64] PERF: optimize index.__getitem__ for slice & boolean mask indexers --- doc/source/release.rst | 2 ++ doc/source/v0.14.0.txt | 15 +++++++++++ pandas/core/index.py | 51 +++++++++++++++++++------------------- pandas/core/internals.py | 2 +- pandas/tests/test_index.py | 27 ++++++++++++++++++++ pandas/tseries/index.py | 2 -- pandas/tseries/period.py | 2 -- vb_suite/index_object.py | 13 ++++++++++ 8 files changed, 83 insertions(+), 31 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 9b7f1b619f90f..7674cc9f35622 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -105,6 +105,8 @@ API Changes - 
``NameResolutionError`` was removed because it isn't necessary anymore. - ``concat`` will now concatenate mixed Series and DataFrames using the Series name or numbering columns as needed (:issue:`2385`) +- Slicing and advanced/boolean indexing operations on ``Index`` classes will no + longer change type of the resulting index (:issue:`6440`). Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index ada29dc674420..4432e9e891e7d 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -78,6 +78,21 @@ These are out-of-bounds selections - ``NameResolutionError`` was removed because it isn't necessary anymore. - ``concat`` will now concatenate mixed Series and DataFrames using the Series name or numbering columns as needed (:issue:`2385`). See :ref:`the docs ` +- Slicing and advanced/boolean indexing operations on ``Index`` classes will no + longer change type of the resulting index (:issue:`6440`) + + .. ipython:: python + + i = pd.Index([1, 2, 3, 'a' , 'b', 'c']) + i[[0,1,2]] + + Previously, the above operation would return ``Int64Index``. If you'd like + to do this manually, use :meth:`Index.astype` + + .. ipython:: python + + i[[0,1,2]].astype(np.int_) + MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/index.py b/pandas/core/index.py index 4a4086c4eeb0c..c16e2eff06904 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -631,34 +631,35 @@ def __hash__(self): raise TypeError("unhashable type: %r" % type(self).__name__) def __getitem__(self, key): - """Override numpy.ndarray's __getitem__ method to work as desired""" - arr_idx = self.view(np.ndarray) + """ + Override numpy.ndarray's __getitem__ method to work as desired. + + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). + + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. 
+ + """ + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + __getitem__ = super(Index, self).__getitem__ if np.isscalar(key): - return arr_idx[key] - else: - if com._is_bool_indexer(key): - key = np.asarray(key) + return __getitem__(key) - try: - result = arr_idx[key] - if result.ndim > 1: - return result - except (IndexError): - if not len(key): - result = [] - else: - raise + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. + return __getitem__(key) - return Index(result, name=self.name) + if com._is_bool_indexer(key): + return __getitem__(np.asarray(key)) - def _getitem_slice(self, key): - """ getitem for a bool/sliceable, fallback to standard getitem """ - try: - arr_idx = self.view(np.ndarray) - result = arr_idx[key] - return self.__class__(result, name=self.name, fastpath=True) - except: - return self.__getitem__(key) + result = __getitem__(key) + if result.ndim > 1: + return result.view(np.ndarray) + else: + return result def append(self, other): """ @@ -2800,8 +2801,6 @@ def __getitem__(self, key): return result - _getitem_slice = __getitem__ - def take(self, indexer, axis=None): """ Analogous to ndarray.take diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 10017f89e5204..74a8ce0118d88 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3737,7 +3737,7 @@ def get_slice(self, slobj, raise_on_error=False): if raise_on_error: _check_slice_bounds(slobj, self.index) return self.__class__(self._block._slice(slobj), - self.index._getitem_slice(slobj), fastpath=True) + self.index[slobj], fastpath=True) def set_axis(self, axis, value, maybe_rename=True, check_axis=True): cur_axis, value = self._set_axis(axis, value, check_axis) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index e828bc100dfcf..3e578a5e36bb1 100644 --- a/pandas/tests/test_index.py +++ 
b/pandas/tests/test_index.py @@ -323,6 +323,25 @@ def test_fancy(self): for i in sl: self.assertEqual(i, sl[sl.get_loc(i)]) + def test_empty_fancy(self): + empty_farr = np.array([], dtype=np.float_) + empty_iarr = np.array([], dtype=np.int_) + empty_barr = np.array([], dtype=np.bool_) + + # pd.DatetimeIndex is excluded, because it overrides getitem and should + # be tested separately. + for idx in [self.strIndex, self.intIndex, self.floatIndex]: + empty_idx = idx.__class__([]) + values = idx.values + + self.assert_(idx[[]].identical(empty_idx)) + self.assert_(idx[empty_iarr].identical(empty_idx)) + self.assert_(idx[empty_barr].identical(empty_idx)) + + # np.ndarray only accepts ndarray of int & bool dtypes, so should + # Index. + self.assertRaises(IndexError, idx.__getitem__, empty_farr) + def test_getitem(self): arr = np.array(self.dateIndex) exp = self.dateIndex[5] @@ -762,6 +781,14 @@ def test_join_self(self): joined = res.join(res, how=kind) self.assertIs(res, joined) + def test_indexing_doesnt_change_class(self): + idx = Index([1, 2, 3, 'a', 'b', 'c']) + + self.assert_(idx[1:3].identical( + pd.Index([2, 3], dtype=np.object_))) + self.assert_(idx[[0,1]].identical( + pd.Index([1, 2], dtype=np.object_))) + class TestFloat64Index(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f81634f45bdb2..c58447acec621 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1406,8 +1406,6 @@ def __getitem__(self, key): return self._simple_new(result, self.name, new_offset, self.tz) - _getitem_slice = __getitem__ - # Try to run function on index first, and then on elements of index # Especially important for group-by functionality def map(self, f): diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 337533ad29f4f..5fca119c14e83 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1056,8 +1056,6 @@ def __getitem__(self, key): return PeriodIndex(result, 
name=self.name, freq=self.freq) - _getitem_slice = __getitem__ - def _format_with_header(self, header, **kwargs): return header + self._format_native_types(**kwargs) diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py index 8b348ddc6e6cc..2cfdffdc38541 100644 --- a/vb_suite/index_object.py +++ b/vb_suite/index_object.py @@ -46,3 +46,16 @@ index_int64_intersection = Benchmark('left.intersection(right)', setup, start_date=datetime(2011, 1, 1)) + +#---------------------------------------------------------------------- +# string index slicing +setup = common_setup + """ +idx = tm.makeStringIndex(1000000) + +mask = np.arange(1000000) % 3 == 0 +series_mask = Series(mask) +""" +index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup) +index_str_slice_indexer_even = Benchmark('idx[::2]', setup) +index_str_boolean_indexer = Benchmark('idx[mask]', setup) +index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup) From cda421691d4e5e2bf1c5a9dc80ef84514327af8d Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 14:36:00 +0000 Subject: [PATCH 42/64] BUG: Fixes and tests for extreme values in all data types The extreme values of float and double (Stata, pandas eqiv: float 32 and float64) were not correct. This resulted in incorrect truncation. The handling of missing values have been improved and code to convert missing values in any format has been added. The improvement differentiated between valid ranges for data and missing values. Additional issues were found when handling missing Dates, where missing Dates (NaT) were converted to non-missing dates when written. A test has been added for extreme numeric values as well as missing values. 
--- pandas/io/stata.py | 54 +++++++++++++++++++++++++------- pandas/io/tests/data/stata5.csv | 13 ++++++++ pandas/io/tests/data/stata5.dta | Bin 0 -> 4924 bytes pandas/io/tests/test_stata.py | 10 ++++++ 4 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 pandas/io/tests/data/stata5.csv create mode 100644 pandas/io/tests/data/stata5.dta diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2882a7c42ba1c..1c2ae8ea27baa 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -23,7 +23,7 @@ from pandas.compat import long, lrange, lmap, lzip from pandas import isnull from pandas.io.common import get_filepath_or_buffer - +from pandas.tslib import NaT def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None): @@ -150,6 +150,11 @@ def _datetime_to_stata_elapsed(date, fmt): if not isinstance(date, datetime.datetime): raise ValueError("date should be datetime.datetime format") stata_epoch = datetime.datetime(1960, 1, 1) + # Handle NaTs + if date is NaT: + # Missing value for dates ('.'), assumed always double + # TODO: Should be moved so a const somewhere, and consolidated + return struct.unpack(' nmax: if self._missing_values: return StataMissingValue(nmax, d) @@ -1243,7 +1273,7 @@ def _write_data_dates(self): self._write(var) else: if isnull(var): # this only matters for floats - var = MISSING_VALUES[typ] + var = MISSING_VALUES[TYPE_MAP[typ]] self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) def _null_terminate(self, s, as_string=False): diff --git a/pandas/io/tests/data/stata5.csv b/pandas/io/tests/data/stata5.csv new file mode 100644 index 0000000000000..cc597582472e3 --- /dev/null +++ b/pandas/io/tests/data/stata5.csv @@ -0,0 +1,13 @@ +byte_,int_,long_,float_,double_,date_td,string_,string_1 +0,0,0,0,0,,"a","a" +1,1,1,1,1,,"ab","b" +-1,-1,-1,-1,-1,,"abc","c" +100,32740,-2.15e+09,-1.70e+38,-2.0e+307,01jan1970,"abcdefghijklmnop","d" 
+-127,-32767,2.15e+09,1.70e+38,8.0e+307,02jan1970,"abcdefghijklmnopqrstuvwxyz","e" +,0,,,,01jan2014,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" +0,,,,,01jan2114,"1234567890","1" +,,0,,,31dec2014,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" +.a,.a,.a,.a,.a,29feb2012,"!","A" +.z,.z,.z,.z,.z,,"&","Z" +,,,0,,,"1.23","!" +,,,,0,,"10jan1970","." diff --git a/pandas/io/tests/data/stata5.dta b/pandas/io/tests/data/stata5.dta new file mode 100644 index 0000000000000000000000000000000000000000..4ee2ca902e757a48a0da219ebc6b70b13e1ff0d7 GIT binary patch literal 4924 zcmeHK%TE(Q7@q|bIT%kK)C)rhL=)3=mm-!Bg(3ytDBuH0)@^s&U1+yucX(;ymK!9- zN8-sx;=$L!Kfr^;vnD1UJ$N7yxI)w@iO4!LI}w0xR!uYSd2hXYHk$4kCmUR?&?<|F@ZE0ghg-C4O+uQ*j~aRlQYvLBcw7p zypjzWzKZ`v)30T|`2B`rlF z_NLJI9O|O>)@JaH4R&|&+qS<8!2f`Mj%NTdEDoBZU>mu#ijwQP#J2NGUn}vOU7n5+ z{-n0klMlb6@)USWYh7S6VB=<)T=^zV{XcU%35to^3+&P)x{~W54wz;>4bym44|A17 z=A2&%{1PU%wWJ6Fo1<{Vvn>(sC$Tx`)L_2V7Z?#js_g zkww+&;%%|^qaE?iu4BhfoIG{FTU%ptRz`U~#iMX4duc(CbEGzCycjGl|E9~bNGn1bRYZOf1#W;Th^&^RkkmjFLBgWuHx~Td bAP_iWcu+CLXj7O-KH|0IQ_*fY?2*tf*7&hK literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 359bd712dba87..0d48c3b019c82 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -43,6 +43,8 @@ def setUp(self): self.dta2_13 = os.path.join(self.dirpath, 'stata2_v13.dta') self.dta3_13 = os.path.join(self.dirpath, 'stata3_v13.dta') self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') + self.csv14 = os.path.join(self.dirpath, 'stata5.csv') + self.dta14 = os.path.join(self.dirpath, 'stata5.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) @@ -296,6 +298,14 @@ def test_read_write_dta13(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + def 
test_read_write_reread_dat14(self): + parsed = self.read_dta(self.dta14) + parsed.index.name = 'index' + with tm.ensure_clean() as path: + parsed.to_stata(path, {'date_td': 'tc'}, write_index=False) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 3bde9c95c89672796f06d32bc82c9ed620d04900 Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 15:39:11 +0000 Subject: [PATCH 43/64] BUG: Fixes and tests for extreme values in all data types The extreme values of float and double (Stata, pandas eqiv: float 32 and float64) were not correct. This resulted in incorrect truncation. The handling of missing values have been improved and code to convert missing values in any format has been added. The improvement differentiated between valid ranges for data and missing values. Additional issues were found when handling missing Dates, where missing Dates (NaT) were converted to non-missing dates when written. A test has been added for extreme numeric values as well as missing values. --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1c2ae8ea27baa..e885fa6fcd990 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -343,7 +343,7 @@ def __init__(self, encoding): # that can be represented. 
it's the 27 ABOVE and BELOW the max listed # numeric data type in [U] 12.2.2 of the 11.2 manual float32_min = '\xff\xff\xff\xfe' - float32_max = '\xff\xff\xff\xfe' + float32_max = '\xff\xff\xff\x7e' float64_min = '\xff\xff\xff\xff\xff\xff\xef\xff' float64_max = '\xff\xff\xff\xff\xff\xff\xdf\x7f' self.VALID_RANGE = \ From 20d61915784b613f0442764b8f313b119ce6516b Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 16:15:37 +0000 Subject: [PATCH 44/64] BUG: Fixes and tests for extreme values in all data types The extreme values of float and double (Stata, pandas eqiv: float 32 and float64) were not correct. This resulted in incorrect truncation. The handling of missing values have been improved and code to convert missing values in any format has been added. The improvement differentiated between valid ranges for data and missing values. Additional issues were found when handling missing Dates, where missing Dates (NaT) were converted to non-missing dates when written. A test has been added for extreme numeric values as well as missing values. 
--- pandas/io/stata.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index e885fa6fcd990..74e0326dfecf8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -154,7 +154,7 @@ def _datetime_to_stata_elapsed(date, fmt): if date is NaT: # Missing value for dates ('.'), assumed always double # TODO: Should be moved so a const somewhere, and consolidated - return struct.unpack(' Date: Fri, 28 Feb 2014 16:23:35 +0000 Subject: [PATCH 45/64] Disabled the big endian skips --- pandas/io/tests/test_stata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 0d48c3b019c82..d01e26ce2f378 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -159,7 +159,7 @@ def test_read_dta4(self): tm.assert_frame_equal(parsed_13, expected) def test_read_write_dta5(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', @@ -173,7 +173,7 @@ def test_read_write_dta5(self): original) def test_write_dta6(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = self.read_csv(self.csv3) original.index.name = 'index' @@ -206,7 +206,7 @@ def test_read_dta9(self): tm.assert_frame_equal(parsed, expected) def test_read_write_dta10(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame(data=[["string", "object", 1, 1.1, np.datetime64('2003-12-25')]], @@ -245,7 +245,7 @@ def test_encoding(self): self.assert_(isinstance(result, unicode)) def test_read_write_dta11(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(1, 2, 3, 4)], columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______']) @@ -264,7 +264,7 @@ def test_read_write_dta11(self): 
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) def test_read_write_dta12(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-']) From a66ae27915e76d984a4478acab1a66720c3f2aa4 Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 17:01:44 +0000 Subject: [PATCH 46/64] Fixed legacy date issue with format 114 files Added test for 114 files --- pandas/io/stata.py | 4 ++-- pandas/io/tests/test_stata.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 74e0326dfecf8..e360fa2a86771 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True, return reader.data(convert_dates, convert_categoricals, index) -_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"] +_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] def _stata_elapsed_date_to_datetime(date, fmt): @@ -109,7 +109,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): from warnings import warn warn("Encountered %tC format. 
Leaving in Stata Internal Format.") return date - elif fmt in ["%td", "td"]: + elif fmt in ["%td", "td", "%d", "d"]: return stata_epoch + datetime.timedelta(int(date)) elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week year = datetime.datetime(stata_epoch.year + date // 52, 1, 1) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index d01e26ce2f378..afb8241cc0062 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -301,6 +301,9 @@ def test_read_write_dta13(self): def test_read_write_reread_dat14(self): parsed = self.read_dta(self.dta14) parsed.index.name = 'index' + parsed_10 = self.read_dta(self.dta14_10) + parsed_10.index.name = 'index' + tm.assert_frame_equal(parsed_10,parsed) with tm.ensure_clean() as path: parsed.to_stata(path, {'date_td': 'tc'}, write_index=False) written_and_read_again = self.read_dta(path) From 840efe6c92b7d011fd096f688e8fefa88bb0ec6c Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 17:02:38 +0000 Subject: [PATCH 47/64] Added format 114 (Stata 9/10/11) data file --- pandas/io/tests/data/stata5_v10.dta | Bin 0 -> 4924 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/io/tests/data/stata5_v10.dta diff --git a/pandas/io/tests/data/stata5_v10.dta b/pandas/io/tests/data/stata5_v10.dta new file mode 100644 index 0000000000000000000000000000000000000000..76de9a40090a7b43e6e4cdeba156bc8cb3e36bbc GIT binary patch literal 4924 zcmeHKO-vI(6n+c-a6r9yP-DUnQi2I-+ERXMh(dvaii+~1;z76F?GN;ic99>6TcU}O z7{%zRYU07agEtdmqQryIM2(4Fj0X-R0#_R~5hJqB%+5kv+Um)emX~ziyf@!_JM(4U z%run8K>-wkDvdwRb6eg-VGQeS7oJ1i&THJ0ktjw0JW>>epr$WIDOkdo5Ayr|*76O| zRlc6)C>yTaM1CAbOqgnrO|&DE!DvRts@-M-{9OFCq%3{o>_d`x3JgY?CspI|OL#m3 zz#H(pGG=gk13b==UW7ou-HT_DRsi@s1;>OV6u zI5f)7!V z-Py~96XxVqTs%J6Y&2DvE32w&EC$FKE{x(F+YfR3B6ic(;r4`)`u_#F`7pu{Zsc(D zA>JX0p>R102ax3ECFBVq43yj=^6`TnU%wA|MZZgOqo5e_c_c{`kbMZXOT5JE(SM%Q z%^1_&-%sb^ukpK9$=zLGyh@2!pV 
zSE+8eQ8GXPb%SpD9+~)jiM|hlTh+MR_B+(p@;>NM@sXllnpeXk_MNfI;}lN^;~EWL z`(4eit)uL|iYIoxtjKs$GQ4A4?7&>$v!j&d$t5njagDI^f~{WV!l*Y@5JxU?QM|&1 ec5V;~WTT;n_ZuzM24cmDwzuM3w3`hzqW=vqeAQ3@ literal 0 HcmV?d00001 From 661ab241e4e842590bb91ca4161cb289f121472f Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 28 Feb 2014 19:06:43 +0000 Subject: [PATCH 48/64] Add test for Stata data with file format 114 --- pandas/io/tests/test_stata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index afb8241cc0062..0b8cb2be2371d 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -45,6 +45,7 @@ def setUp(self): self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') self.csv14 = os.path.join(self.dirpath, 'stata5.csv') self.dta14 = os.path.join(self.dirpath, 'stata5.dta') + self.dta14_10 = os.path.join(self.dirpath, 'stata5_v10.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) From 61b141bc934c7e234f9675234a578f043f3bbdd1 Mon Sep 17 00:00:00 2001 From: DSM Date: Sat, 1 Mar 2014 10:01:02 -0500 Subject: [PATCH 49/64] ENH: add method='dense' to rank --- doc/source/release.rst | 1 + pandas/algos.pyx | 42 ++++++++++++++++++++++++++++++++------ pandas/core/frame.py | 3 ++- pandas/core/series.py | 3 ++- pandas/tests/test_stats.py | 22 ++++++++++++++++++-- 5 files changed, 61 insertions(+), 10 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 3c6d2643e3fff..08bfcbe42ad5b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -127,6 +127,7 @@ Improvements to existing features - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`) - Testing statements updated to use specialized asserts (:issue:`6175`) - ``Series.rank()`` now has a percentage rank option (:issue:`5971`) +- ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`) - ``quotechar``, 
``doublequote``, and ``escapechar`` can now be specified when using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) - perf improvements in DataFrame construction with certain offsets, by removing faulty caching diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 7f406611c82f7..14c9ec2f3355d 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -68,12 +68,14 @@ cdef: int TIEBREAK_MAX = 2 int TIEBREAK_FIRST = 3 int TIEBREAK_FIRST_DESCENDING = 4 + int TIEBREAK_DENSE = 5 tiebreakers = { 'average' : TIEBREAK_AVERAGE, 'min' : TIEBREAK_MIN, 'max' : TIEBREAK_MAX, - 'first' : TIEBREAK_FIRST + 'first' : TIEBREAK_FIRST, + 'dense' : TIEBREAK_DENSE, } @@ -137,7 +139,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, """ cdef: - Py_ssize_t i, j, n, dups = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 ndarray[float64_t] sorted_data, ranks, values ndarray[int64_t] argsorted float64_t val, nan_value @@ -200,6 +202,10 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if pct: return ranks / count @@ -214,7 +220,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, """ cdef: - Py_ssize_t i, j, n, dups = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 ndarray[int64_t] sorted_data, values ndarray[float64_t] ranks ndarray[int64_t] argsorted @@ -265,6 +271,10 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if 
pct: return ranks / count @@ -279,7 +289,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', """ cdef: - Py_ssize_t i, j, z, k, n, dups = 0 + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 ndarray[float64_t, ndim=2] ranks, values ndarray[int64_t, ndim=2] argsorted float64_t val, nan_value @@ -324,6 +334,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = 0 + total_tie_count = 0 for j in range(k): sum_ranks += j + 1 dups += 1 @@ -347,6 +358,10 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if axis == 0: @@ -362,7 +377,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', """ cdef: - Py_ssize_t i, j, z, k, n, dups = 0 + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 ndarray[float64_t, ndim=2] ranks ndarray[int64_t, ndim=2] argsorted ndarray[int64_t, ndim=2, cast=True] values @@ -395,6 +410,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = 0 + total_tie_count = 0 for j in range(k): sum_ranks += j + 1 dups += 1 @@ -415,6 +431,10 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if axis == 0: @@ -430,7 +450,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', """ cdef: - Py_ssize_t i, j, n, dups = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 ndarray[float64_t] 
ranks ndarray sorted_data, values ndarray[int64_t] argsorted @@ -502,6 +522,10 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: raise ValueError('first not supported for non-numeric data') + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if pct: ranks / count @@ -545,6 +569,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', cdef: Py_ssize_t i, j, z, k, n, infs, dups = 0 + Py_ssize_t total_tie_count = 0 ndarray[float64_t, ndim=2] ranks ndarray[object, ndim=2] values ndarray[int64_t, ndim=2] argsorted @@ -600,6 +625,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = infs = 0 + total_tie_count = 0 for j in range(k): val = values[i, j] if val is nan_value and keep_na: @@ -621,6 +647,10 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', elif tiebreak == TIEBREAK_FIRST: raise ValueError('first not supported for ' 'non-numeric data') + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if axis == 0: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 228fa1fd08a5f..6c1037f018e02 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4182,11 +4182,12 @@ def rank(self, axis=0, numeric_only=None, method='average', Ranks over columns (0) or rows (1) numeric_only : boolean, default None Include only float, int, boolean data - method : {'average', 'min', 'max', 'first'} + method : {'average', 'min', 'max', 'first', 'dense'} * average: average rank of group * min: lowest rank in group * max: highest rank in group * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups na_option : {'keep', 
'top', 'bottom'} * keep: leave NA values where they are * top: smallest rank if ascending diff --git a/pandas/core/series.py b/pandas/core/series.py index 5d6115b0e4ef9..9e6c0bd9305ab 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1720,11 +1720,12 @@ def rank(self, method='average', na_option='keep', ascending=True, Parameters ---------- - method : {'average', 'min', 'max', 'first'} + method : {'average', 'min', 'max', 'first', 'dense'} * average: average rank of group * min: lowest rank in group * max: highest rank in group * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups na_option : {'keep'} keep: leave NA values where they are ascending : boolean, default True diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py index 7e2144e801122..cb3fdcafd4056 100644 --- a/pandas/tests/test_stats.py +++ b/pandas/tests/test_stats.py @@ -12,7 +12,6 @@ assert_almost_equal) import pandas.util.testing as tm - class TestRank(tm.TestCase): _multiprocess_can_split_ = True s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) @@ -23,7 +22,8 @@ class TestRank(tm.TestCase): 3.5, 1.5, 8.0, nan, 5.5]), 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), - 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]) + 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), + 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), } def test_rank_tie_methods(self): @@ -43,6 +43,24 @@ def _check(s, expected, method='average'): series = s if dtype is None else s.astype(dtype) _check(series, results[method], method=method) + def test_rank_dense_method(self): + dtypes = ['O', 'f8', 'i8'] + in_out = [([1], [1]), + ([2], [1]), + ([0], [1]), + ([2,2], [1,1]), + ([1,2,3], [1,2,3]), + ([4,2,1], [3,2,1],), + ([1,1,5,5,3], [1,1,3,3,2]), + ([-5,-4,-3,-2,-1], [1,2,3,4,5])] + + for ser, exp in in_out: + for dtype in dtypes: + s = Series(ser).astype(dtype) 
+ result = s.rank(method='dense') + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + def test_rank_descending(self): dtypes = ['O', 'f8', 'i8'] From 1dc157c6618efcfa46c2df15c9b1ec62f020202b Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Sat, 1 Mar 2014 17:05:46 +0000 Subject: [PATCH 50/64] Added additional data files for testing alternative Stata file formats --- pandas/io/stata.py | 11 +++-- pandas/io/tests/data/stata2_113.dta | Bin 0 -> 1490 bytes pandas/io/tests/data/stata2_114.dta | Bin 0 -> 1786 bytes pandas/io/tests/data/stata2_115.dta | Bin 0 -> 1786 bytes pandas/io/tests/data/stata3_113.dta | Bin 0 -> 12737 bytes pandas/io/tests/data/stata3_114.dta | Bin 0 -> 13255 bytes pandas/io/tests/data/stata3_115.dta | Bin 0 -> 13255 bytes pandas/io/tests/data/stata4_113.dta | Bin 0 -> 1528 bytes pandas/io/tests/data/stata4_114.dta | Bin 0 -> 1713 bytes pandas/io/tests/data/stata4_115.dta | Bin 0 -> 1713 bytes pandas/io/tests/data/stata5_113.dta | Bin 0 -> 4628 bytes pandas/io/tests/data/stata5_114.dta | Bin 0 -> 4924 bytes pandas/io/tests/data/stata5_115.dta | Bin 0 -> 4924 bytes pandas/io/tests/data/stata6.csv | 6 +++ pandas/io/tests/data/stata6.dta | Bin 0 -> 3048 bytes pandas/io/tests/data/stata6_113.dta | Bin 0 -> 2752 bytes pandas/io/tests/data/stata6_114.dta | Bin 0 -> 3048 bytes pandas/io/tests/data/stata6_115.dta | Bin 0 -> 3048 bytes pandas/io/tests/test_stata.py | 73 +++++++++++++++++++++++++--- 19 files changed, 78 insertions(+), 12 deletions(-) create mode 100644 pandas/io/tests/data/stata2_113.dta create mode 100644 pandas/io/tests/data/stata2_114.dta create mode 100644 pandas/io/tests/data/stata2_115.dta create mode 100644 pandas/io/tests/data/stata3_113.dta create mode 100644 pandas/io/tests/data/stata3_114.dta create mode 100644 pandas/io/tests/data/stata3_115.dta create mode 100644 pandas/io/tests/data/stata4_113.dta create mode 100644 pandas/io/tests/data/stata4_114.dta create mode 100644 
pandas/io/tests/data/stata4_115.dta create mode 100644 pandas/io/tests/data/stata5_113.dta create mode 100644 pandas/io/tests/data/stata5_114.dta create mode 100644 pandas/io/tests/data/stata5_115.dta create mode 100644 pandas/io/tests/data/stata6.csv create mode 100644 pandas/io/tests/data/stata6.dta create mode 100644 pandas/io/tests/data/stata6_113.dta create mode 100644 pandas/io/tests/data/stata6_114.dta create mode 100644 pandas/io/tests/data/stata6_115.dta diff --git a/pandas/io/stata.py b/pandas/io/stata.py index e360fa2a86771..2ecdb22a5cc7b 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): # numpy types and numpy datetime isn't mature enough / we can't rely on # pandas version > 0.7.1 #TODO: IIRC relative delta doesn't play well with np.datetime? + #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly if np.isnan(date): return np.datetime64('nat') @@ -154,7 +155,7 @@ def _datetime_to_stata_elapsed(date, fmt): if date is NaT: # Missing value for dates ('.'), assumed always double # TODO: Should be moved so a const somewhere, and consolidated - return struct.unpack(' """ - + # TODO: Needs test def __init__(self, offset, value): self._value = value value_type = type(value) @@ -370,8 +371,8 @@ def __init__(self, encoding): 'b': 101, 'h': 32741, 'l': 2147483621, - 'f': np.float32(struct.unpack('9q{gU z0MPMPq+h|%*viO+;s5_XfByXb{U;@{B()?nH#I&PtQwPmD@e*r$EpaV z08<;1NO@{%c1~qHZgsi&c_kV6R1}sb7L}wH;Z~B7n3EP?nVN`G2|hU_ry?^|4Ndh7 z(jftg!cJj8h?kTzz}1xGBC!jR*cnLdN;n(Yt`W`dh~o$QzhU94H|?cPS?-!#5lmJo zx7b0VGeolwg6L(F%?^rvU;g*V|99V*KmtJUpdLm)tp~x>)TdyPQ3FTAkc?ylNlY3y M-vSdM162|u07r08-T(jq literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata2_114.dta b/pandas/io/tests/data/stata2_114.dta new file mode 100644 index 0000000000000000000000000000000000000000..df9c19ee0a9762757d775f5793760b67a2085ebb GIT binary patch literal 
1786 zcmXS7Vr1Z8U}k^-DIf*HCv6#jgG92EIvBqEY-W4_#A)%_3>ExUtaU&PRsxG`2FbWcdI8&!0cPfB#8IEJ-cN z%uS6?2C2gW;0ls5)3GW7DZru+Ayl54nw?V_k6T@CeqKoiJ{5(fiA5!;MYxq@B<7^W zSEeT7RDw?q;ZhWqYDqGKK{`Z8)zDNA&Q4)KuuIAjYE(;{VI0+xLKu@lwIljQuN@9CM%+UJ-LA3mu6 zq+-;+Tjd>^XfM?7`7eBD5AVVKIi}&=`~Uh2pFVkk_Tj(#PuSwg3l>fPhtDjSK6%cJ zX`z3IXBhPZQ|HfH__v=>KQMFNlD~al`-U0wmn{0*PpIE8Ep+DJzRyhkr#DsDT+hD; zYws+YGIP#?$&04{?Z+0+n?Cm+vcNE*^Z)T}hM74}n)A0`ufFkTqW(S&{|IK7?t}Vx z%=qgw|I2Id|N8g;2VVal!ua>?|GzTu=Co0DcupmGwjf?OZS_b{ZagA3Rrk$Yb`#l*fm3_Z0qTT>Dsk`9XIo9lb_g%>71Km-bYU>Yosfdk9EV=HD;;H;4byMr)RT zJLP{D2Q@AGSY5(&7pnYUlN=$pBjq`647fMl`VvA5rl0&@0s3Da0%+mcZ!My#7J89q0)eAviZ<4pU&#~zVoC^;Ll{%jlDv+<9&e%eL-qR)tcp` zCXmb!UXs%I3%dS-O;t<2c2uG%oz?sF@T5wITV!#|?E=Tu2;AwHM3QRF9A_(ZigMK; zB?tP$v1Yy;*A-R_6GJ_z?0u6k9@~U9ho0o)P>daD&t>9zRMsv({otL1Lx$zMW znFZ;rvD&3Cm7;Hw3%6Gb8wb`2T*XT=I=9X&J2piuEf9VwA3kh=;D1{-As*9yR_EQ{ zgGx(ok}57r2z>llVC-L$+sjpmo#2q!29WFgi$4ayN?AdspC`Yc)jO=>s9bu7eAOvl zXmELox84z#peJz5PP2lIm%o=aP^j;fZ*mFx7*-59(g4ccy-Vzzz7ZN8)(gte4;VO4 zV8tGFanP)Ej~;}t3NQu5p8~|_?=+Zl3m*`2;#(oh@=jnn>uK$%r^r}yL6UkPW9Rr6 zKHeHcl4?vDtgd>Y4^?V^BUxScpbH zpOCH*>B8>s8xdxc74@-sj!F0@L?ukt=pm>P3y!(MF{8fJOu4Sl@$I{X;I~#p-rtCJ zih6EV0vwR!ftr;uhM{nJxCxf{1FUY^C_<%kFUas42ZXHSdcvG%sZEt?rIjEjc_U+a ze8je)fU~CBQEdUy_bd^?u~#JQ^GxAkw2sKl8^g*=SZRY;%*fbLWXuN?_26$W-{yOO zH3UWtr*imPV%IcB=$U6tq@6aDVd}x;wnFWw@tR|6KY2Ylm8dXjXE4OZ?BZc%uZdyo0Wk)y z980+hD;n|Ql#rOl5J_x9!+hQ%mdnjNV0GP$ku#wm01r3Rr6*J9mL&A7I4={>P#&Yx#y4K_id@dR~ggx6j9Ox93-C6qGP9O%FakA! 
zfa=Hbhgn_KqeWC1)0*C6YJ@b`&P0mt4J$@i>4Q#Or&-DGFGBj-A;~qS!>s<^>qS(a z+?I~)cSDG(b|OlxE38;qWXFCWOqd1{kBWfLfWY?gJbSCm< zcZA=_A}4u)@EbMA&X4`TvFW^z%{W*wMqOG)x%Ta7@vQs8TRS**74t^2M#faROV*Nn z{h}YN?7*-X$EV%-4or%)l3mj`|VHgY4GEIZ{3?GJ6Y(~a0DF7C>Hir3EXW3h8}1VS)`n_;wa9 zTLi~WAeISy{HbA(j!lrh1E%j-ostqsrHkF^eOHEz9odmcM==Sw=79Kv@TWA$K$qdj zUL{1yq3}CapJ|Aq%8eehF5tc3^Fw2E^EAa1@nwF0{CJy&&hcAyTFf zGS&u!8HthfLW7LI{t-A<#%KENG9AGVV}z9^;MI~kp)%J2qT~y;YzN0iBIIj;MDyuV zfN=Wdd2PNNVfBF)Kq`9E-Cj?Gh4%J@d2NQ3_DFIdNK%PdlK8-;BVZ*GR-E}Gtf9q% zXey6#qt*e>g^f>|5-EWI;)Gbn!2dI_lExn|9RVHK!{>I4f@4NUE5vf$mA>Zg2`Rms z68U!lkWQ%nIDG6ISjpo9mX3rz?@~Edkj?6xv({rI^`%u2Iude-CCU{)(6Tce3xY(< z1jq9ERQr!X9tNzgFlDp)-AiJqRNz6|ue2hyc~*c72c!!e3qkma8f4)9k3pWDsEY+w z46*SWsMMn$P32iK-o=V=Cq9B>U6JIekcb3utcV{yJ`mv##p_$JVsv`80aMJ2hFKX% zy|oVH9|4YaLsv?u{!$H6{97Ozd9d1UrW{t+bKXYEtRG0f7-ma`$LWaF=VR2R2l{0; zXgO62zkJFlP5NxAx&orVbbb?+8xNtGHywz$6Z6G+6bRE3(l-yhT8bne$IPNH0_NP{PKA?42=3(~o%DCGb+wi@hZF#nTyIeZMr<7dw8 zxDi&2(s@{^r_^&(FCwF%+5dG@%Bq~27bX_ASlHU5IAUB^3E7T&T zMWZVlH1enX^eHSlnf-6SEj-E^TmllQoJXnN)q&_gsz)r7QT+gm)MGsqMvwWQ%`aX! 
z7IBTo`fw0djJZd4Q00n+CTwj**0n}mib7yzl!be~9<}%dj#cnu>&Bu(oy|TrenrPv zUC>Y4(f5RQ{ia^%JL07v{W1ww#(rLjo2>KYH7y zPIx%>xuA@h1;}`de0?K6b_B85ncKV>hwz_3`d$&n{ZbIiXj+-lmN=4E0(Tdy12YY!+=h>(BmC!lz4b(p|AKn3f)&G7`&}3oW9ayY zZHeucdVy;ahFE5xUv{ASyEH35r%u%3s_&Hyi1Gc9-LNu_hQDznoxEP5lncP*StzB9 zjBNqO-tZOfpFsx>!^-T!JXYt}F%^SvB0U)AOnRE%2+ZpRa4Zy7cB1#UB4bba_J==1 zzu1DwQAK&I-gfdnD$o3kHfr35)Q|r|Q1TZeV+&Er6b!pvh~+Nd*GHAX{g`Hp3-VY) z@T3eXul}6Aj~qzCl=p%%Cmb1Df+X*OW5H5 z_yOUU@<;wqAgHP4u5IPA?wli1lVE{3f6n~F?nMoaM z1Bh)u119oXSc$<7lMRm9qpkA!u%9O($wo7j-*kf2B{n-sIm2u$@Bt)dvVm|PB4K4C z>XM5VYpp??Lqjl<%;^2zMJI3;8*-E~w`NhFSpg(okNILB2`gK`v3v|Z2OQ&$@(y;B zQKL@*Sp$e6bHgzz^_fe*n(t4pUo;ZwM{FBXf<=-FK*gS#6+2}zWY8C}WEGxZjrC1( zapIdxhvyJte;$xKQLvH>9V^%7=V) z-c+=k6%?kbAfGinjm)9Us#$bh$8IERiw%)aM#1WSXmT00qwWa5o-dv?4gJzt)o0&) z)|gp+j4~;6sl&zoWVCe?qO4jENG3|Dz_B@qrIx?>WE!eJSk*E9Bw{&q0#j@rtvD-^ zjrW@p#RGc)cLWMkiGG=+L0o4{NBE7A@yv_d2_i z;!`b&axfN-6r+%6$SMdeNI?j%z~{vJn|OV`2CQsJIT0j^=xp zzd%_#KnHT*m{GqBjukASQ777yw2JnGJG~LHoCcHYK>qjWeOL4R(KF%LsKVM&w&p@s z=Tel1lSMeS@pC3Ok2w+f_SfkBvw+lVS;7Wxep)gUj?LoLjl7W6zopsmg;S zJ7DLpjR&LzwEP21ZUT}xn-fRQfUa z`F$JS7(FO+tvFqK=j28 zWt5u~O)I?Skt1F`2=`8gmCF{{n@2HB zzqH2%-mg&pTl|+ej|)9)&q#-n+nISBmB-J>QT7{v2PoS zSmKH}uCMtNVi|XX%28XW*NQnLq;d#hI;6p|Y74iuCm_X|W9ez3py&{k(jQg~^3q?Z z+-f^@ahOj0f430nZ5piHu*iAtfb7?-bQi_aTSppgd)7bgX-=a#(EZgANGC=eWZg;74Y6q^>=8(-N1;Y5JftGhcOK;TWdu_Z{C=1Xp%g`_P z05L4>b`N(-Ni@nZmn?oRK>7D0{2Gg5>jTHmY2(#>!9vW+Xei8kK#cD852=hTHh0BJ za`$JEDCq|Pd7wdRN9Dr*e7@_ug(&6c*!f%VXK+R)zfo>#G7WbOBTJW@a?hant)PeOSMQn-9XINw1 z;om8@CxsSNMG%)W$k?c{St$9bRK_})m6NBPNl)SC|9_K z#1xwdHyeYHtH(lO!Bw;k6ef+|n!5z#FX3^01Bl-F{7cF`-$fkqDsmhnl7y*vtCeKcjO?Jw-|f`8GDUV?&o_pU#1=3 zuv-FR+)7_ld2TA*S@I<@*2BtRY#Yo6Y~!d=*$dR=AfJ=I41~FYeQdSq9IK1{`7M=> z@1+rBB}sVXN0da!dJTi83d4xHimFz|jtoa#Vi8Lae~#6c-grkPF^$@cTS@Nq@+aIS zY#Xcef)qU*vD9mj3(m`-W8Z4zKZj=))$b_tAdSu%6HY2a;LKBX+prS2=K&-r{x_}q ziFOeP-xf!n) z`qJTGavn-)EifVDNT%O+h~+x(wc;xbi;p!Xm#~JBZivNtAFU3bMqF1S7Te>1G{&Li 
zGxF!n)eSyq{t8Vl)H~P`oRNDqV6>*uM&5IYXQG900v4128;sVe;8-LayUUjjSP2~q z!0Uyk^Q^9}_B~~;?5EG3%^(v~0clf!j5R^VW~2JcP|8|7vsj6pc{mn$FF^F}<%}p- zX3*flOUc<#3t{34Q5Rc;9|{={!8bqW=Py`=-uDN`a(&OUh6UaAqLO=%rk)Qc_tFVb zB8!mZrm(Ucz3;0*yy{lL(Fq{T?Sk{H(LT~3DpxY8<=QIZc!oeOPJ@yM6vGAa&{9_Sd$B>3V-C|XH`kJkJapx>B0!qK${Nhd!J5Fn&sh!W>jzP?@-1cc zCzFh#GUz)x>p>)OvKdOGDW?&Oy})c*hq{=xjHPy7gHpcMp7NBkhWPI*BlR3Vi7*CmGb9pYtYDM48E=>OIc&fW~?Zm%%SN&#*%f7h7cv|9O}{n z=a`Ln@Grnh1;6{{8pQI_gw8bm!0HZ7vKE!1qtvE)GnqfSKT&=?i|V(;g~ZqB{a_7p zHgGM{(E)^61BiZrkB!K5$fZG_$s{>|!*W%MQaT9C<#_Z8}uD7LYntSH;0zoyR^DzU zDo@W*ckw&2^ROF{T3<&gI|+=(I`qqbGzgv{VgENnU>i&d`o+49DF1$*9-sL=i3sXV zI2JoiO(#4ZjRYxlidk(@CK}O79=qW za*+aOQuzEue?rs^R$&cU{o0F+heCJHJwtLXyWyK}qAt#GEDEJmA3!AYc8}G|qCT4R zDLB7O?I0@aFVXx>CFK5AEQ$elkg+a;5)h4rM0G5IcMgmJWG5cjE#oh;y6m5wL^-LP z&TUgpoci@7a@c)jtc$>{jn=HF_ZJV-VxVJheNVsK1c<)FuuiCcC4HE1k(6A(23v&H zf$IuQj>g>o4;+i*bL}=@yKsUiIRj$oG1XaAtglgV%@yJm+?z0iusYNLQlut19^r4| zr>xolD?gdkN1hj1Uo* z{3TX*EVa8Rt@)XjnC_6JHK@zhr?Ao;mn$)l@i>$+hCl8d3y3#1r2^9>R^RVw50U$D zi*8`A-$Q4$YtN9!5#0D`w@Kzcla5Y z+)I#4W1wS+NOCma;1~xckHy-P11mc=Ad9As=2pHLBpi5!ZDDNjt7 z@$;yMeMM1yGNK3xPsxU>XzWZ+VLed_?Vh^ksMtFFN@*mG> z@a}tL<9`MaW{VMy;Rj?zn=vcaM$sOHjhs* zY{HFV34W{4%=bsukn8CsGAZxq`^$GoivyVZR~m^N=Y<_60kK@vtawk`gp+u0m2a?O z3>e}iDv=FTzwZvY`-dNqdSG@Y4G`oBNwAWnS-JcDCT!zWeK1s5!Rq{8d5O}Gf6%q5 zcSx))7R9|RtPF&}CWA1Wv4iFF^YxqI*gg|7SWv<0D`pK8xi*YAw*D?wxKV^T(ij;7 zq#*@^Z!sJ@$2(2jjEu#qzajv{aP6zXqI63qT87pVv1SyJzik4J4Z?ic4TT9csf%KG z-e%OL2gYk995Yhap(2y07wz7>AYSu6A@WRHBzdsF346dXNsA?b-J)qSe&SGch1IQC z;Uh{{4PpdePa?XFB{GZIDGh;+rD;~wy7-UU0xPXy<@>@btX{106_q%n=pW8dm!NTk znPVr)j`(fDrhS+%G91h1Gxu%56q|wDo_4Te=;F(Z++i#6NLM{gZTlIKCt(nl;qhkY z3|P60j2+|`TB|qM)4{hluwrz6XvU_*it~Ro((clDB7fc-5EFhElYyPz7EC_KUk~02 z!dR&*oKGdITXanjnW5HVX(~(YMohrdH`R(6j|lgp_ucT#`}w*1wql2g#1KDTSjpn4g)w2j#Ja3eY} zDHsb$b5!3Ca&Z9CHyOR3!pBe929C8b+f-#2RkFsietsfz$3|?kwh2wUJCP`ZTOi4P z0=M-b$iE7)B=ecaw!uo7FY3f!#oWI%3_4~j#>6(IIh`jH?v(=^^GA{oV(@JOdlMk4 z+qLTB0fgx)tG^yI96L-?@x>=i=@G9iCy 
zi;9h%=svLtb!H|J=51?mY#7#_14weR2DyJ>J7W2vP(7MnWsS#L1c=g-X5!*WEUiw0 zmFI26XInlJxaKQD3STLQ)^V^BGT}B~8eS~?DJIOn+D3)Ql8dr3U)h(%pmDVjp_da@D z-p$4@RUE}3A3lX6hcJ;#z~p4Uv}Om6qWwY3U4_?Jz5TbNuqU?=w+8-!(&5bF<|Mi# zPedbUX=#R{*pdSk*c zk=fQvJkshYeyO&SFiV_7`(RmZ7nwi*xZywMzZW=7;-CMb(FEf~UoWeJfBrwi*yi`u9XZuk$Gl#x3D+`=J(H>V zts!wSUNSxtEy5k~_v?Y0Jf@RJr{R<4ck1iz(XW%c*Pwo0%)7Vm|MDkd+NAma^`-yg zf5R3{n!j+`-+X5Nv`Mq4PYwMa!RkMnKQLw9+yx)~g!uzA=7xXteMY627ryYLpD=%5 zYUqrQzRyhgyEo0SIUXMetM4qFJY)9!Neica^ka+WPMh;LQDB(Rd4Kyh!_1f~&Hm`u zo8R~|P#+J&-+~#Y+rZxL)BpXM|Le8aU;h2CumAF)Zh!svkG}rzPyP2R@P733z5nyp ze_s4=KK$jo{+Eyb`S$J{6?;Cxqi_0BI`Zbf7%P?cpP@dI$-pTk$EO-iCdw?($+cD!=}T)a}?H z`0aWrDDh8-Qu>>bSz!&9g<#pwd!+RhoP+xC9i!s_(UX?FsnoxUY&^78NV9w<$TNN? zXI=urSPMZ3>%fx5mrdq`Fz;C4eAtM%^gL6!Em$)c891&*s0xHB(_B-I)@&RXag?V>`8 z5A=g!&3rhn3s`g$LOrPLb&D|WTZA=Z9t-m8mxN*Jj8e6=;B95E8s@UbQ&37>`AEBr z{4`cy<=lr#F}KLYJ1d3t1L_2>{3RKcQ)iSNn!=YBaKD6)7}{U(yCWOmk6}Nn@#^PJ zrSMy%l8Y7s9zPZs+XixHsTpD;*k#xQa)W>IM}M%C=4beN@cUV<-3pG%C3neJ9TS9i z&QI~yTjCt_1cuqDEDUp-zn?iksPCC)a1I&?7G1W~pK|x^5gW&Egm*Rdf->X-vi}4u zw#bW}%F-=nAim1q5R`Bl5WSz{Aj&OxNXW@=g-p|1f$5~BwSi9&vF3s#bw|X`^Dn%; zREQ*18`4=#)dFv-)c!^?JMTlfw1oRyN9{6xH%d|qVN_0UHDdXuJL2-yxV5Li8gKo?$Zi6S0G!Hwzy*$(c`QbwvS2s^mQ( zT_V$j-QQahW`hOwwt5a({s}P)CUewa*Pl_OR(6(7b7Be3=#8&MBVv2OSkwOV08h} z!>AnbhS)UC7JB41BGOJP$}sg%avPyGaGYwG3+n?GUxPX52UvYXGk?l$en%#dW5Vh- z@TJBYEU#dggOK4nNe#00fRB(8>d4z=tT7y9HBa{hQ04+dpPfD>JeZ{?a&9v!C%rN% zrEQUm5S1mz$Djt;%zBX3PHp@t<(lYd;=Me9i_;N#S4&#ovjLFyLZn@=S}%$B3{Zk( zxU;k?Ai9+gK7(K^=<1LnVf6~6GOiUp^RfYg=?M2HfhC!b-Nl1tuK~w)K=c92$55`^ zf=0eLEo@I^h$Pz6Fz+|;r4tMbRU!6XMpUeh_q+3x;Sj6QpZ%OFrHyF9rfb4E<~^EH z8)W}2+C^s+awM;m?%-E`Z$$0yF_;B&h}8xa1fyM8(ZLDV1ee9{1tzU6t=#ht;_rgq zwFtf>@*9VnP&=K?et|`oK&DV8)0$Qc{6&ZhLd0r1P>%ZpEZzUj63-9*U;@h&)Qbr$ zdauGSsghz#+Zk>N`D`O1J9UO(AHd=a`{R(8)%?3yK^2eRofC!(R&y?VF7ncX4(|9& zc+ss1ky5+RoDLe3QtEenokC+A9Z0qno=ved}*z4@lGqk>3TviI+IlG0`(>%$g}yGc@!O{J+dFq zA7(X`j}}s8bZdH_sTNXQIuR+R7g+RQ>5WQUqq5}n6VdwGBFNQ-!>smx!$K-gYC}i# 
zy(vUjITEGT1uPaO*`Y546Q)AMVi&!=CZ!8%fhg|&jm_0EL5he?ps7%YPzn7yj|22VeA)+C&+Fk~Vx9amDN zUne?RS1{n!o(1kk-2+CRl&CZ)4)JL}=_J zd>PLtoF0nSu>q~`fZ+(MQIeymbg3JC;KGnGBRUZ27$yPN91uSU{%YxGj!2Zx!Il8`M(#ue`rUfuLlCs5|E*g;$;;QaA+7Jb{JY)1s0uURW#)e^q^Mu zS~Bo==1Ywzh<6FoLBWf*b&mRKS-cKA`Dvutb5yi9gEfTFj53a-b`1 zrlPFwMASN5q|&FHV6FOb@iaxzN>O1o;>raz7A$4MgW(3W(0q zbrZG=Z)*3xHMwE)8jIp*Fsvscwg`{}7?Z^(FCB%>xd45vCm{OZEnCoGOf>DhJ?Tgr z1m@uvfb>FMB4Ph-7`C5xem4pcOGLy@`+U!8Mo-#KnWvOi&2S=Q);ocl6#__aXzX9; zmIZJ>nU9|JDI#WPjD8eY_&uxL*j1*|HA*+W>`A<$8U$|7WI)_aOyLU5%4+0g2cP}$ zQ@Fp;Y#9GNtJ9B4qFkIvgC}$*WzQZ9()lS!Wq%m966$3z|C4w*Vl>3#Ys~4e9xQt4 z0$A!P_1Msph^^}c>1zoQ8;tC)MpJsJDtUX|XzVcEjVYmSg~u@WXM<%hp*fcv$;xPK z<9ag-kGw>~nAeEd9)4!fXXb6(h}8!W?aHzwD%TLY#?q0bwSFnc%_o6{nxxbiROLHW z{K-Fk21ZAt-<>xF$5@@S|8^?pQmS>aBic{u;mags-`^y4UyBZ-#r)6W7cCeAzs6yG zI0zPf&e3gDxoV<`n_H1Jt&x|)5U>QAxaVt;i(g<^IX|Xu3@X&g=xya&c%0P){j?Qz zPiU8K>V-bTUkcJM6TvbXvRsF6&ICkf44yC+?x&-WC;1#_wXq)(u_*e|J63f<&6wwc zGI}N;<4p36_4wFP_+n$Ue?1oNKSAqzQE;5qC9g}OQsOXr!a+l{v!F5O>8O{_k^QgH z$1WiIjf`UAIOL^V-T9BBkHvpOxs(31TdJN|w|FBkD`vo#i72rRP;VNtuQ!frGaiL} zg-2x;+E)0v&bX-jvV!f$e z;F^TNm+7dNZOHyEmF4G@394W9J$C@2e?NFPSjN(b*AAqk=PRUgK9oEYsgx11P0-kD zzTE9|^nt@*nN^U>Y8*PGVDL?#2jiVc594crX_ybgLcy{VwZ9n=d&;*v{5k5y8cL2X z%w@IKllD=0#^=~_JIdwd^na~s@`X|^aom(>MNOsDe7 zFX;QI0VGU$FDSDk5V3Fsc@GRbfgs=H=R62f_Zghc47r$PeM!+U6!L;) z@MS3=`_TH%s~U4$`UNC<*DRQV6RgJj5gu-f%0N2v5(qBdWc$eeBHmXRyr{%Z8GLug8sYPUSIH5f*AFm{aor0@i* zlO|`Okf+nSAU~34ttHCZRd7EBC6=iUK66hFp%YvODuMnENILYWtGQ={X7vt)*I3J4JTR6_GZT@r<;WZ-k-!y(h=@M6j;_HFF7c& z)+)p)Gz25bh}!Q}coJu^!N(|bdnWas=}!{0m@l?bVA%wXyA^Y_Z<4wJU?>dlAcIEV%v}sO_G!kDfUoVY?Mi8gFf&jv*0AFuWyor6W<&< zESnJ93xM2>2Fnigu|h=5ph6Zcm<*N~h?q-Z9;?aQlY?)aL;ZaP62vwpvM~m!+yzJh z$|?xmvV`B+YzhQ31Fy#yT!&tr8@qp~TpVkTYFp)1MUWJTmt(O}(=o?MFUs2kj`=Zj`eMZI(~_h+9xR-aLI zoHEICsNJQ0WK^RjL|L&GkPM_!fnl@ZOD%uv$y8*2khzcXr{K$>lbB+2Y56&ktbfpy zDDKz;xTEMW6{weqD#T^_G`QawK~4um8`S%R3JDtNO%gh{z@oSo5j$?;7E~a8i&TjJ zt7!;wFJq)#A|Semuso{#I*)$UY$&O3Xim7nafsLni2oWOGgOE}iiAF$y 
zEOvA0RKZA?wJiy^+6>7d=xr?@=6#pXXEo_o$o|fG^nNEdQgpf{ zQ4YqzuzZsee+#~h1dFw?J&-I&j6w>Qe6Tx2LNiCoD=hgLZ9!~=2$dASW^ z3-qm%oWh*)>H|AQf))zyzWPPtFfh+gy}8Ds597^c@QfnoXKH2P#al3L!5aA(%Tmordu9mM}0weMn_H);k93oNJ&v^Ey78t1}X zoGc=!m9G=Ib=;B2cfLmLp97>`jS@C+b}RrndoE7u_z`3qThb-9F@nfrlY?JAUjGs5~a;nxL<0LL>&q9w!x9KkuhQNEYyAq zmfldlkkvS@D#5Kn6s-uANJ>v9+{0i(CtboqVnO`wm}~zXUvh00RK3-FKL&`l=v^u0 zCdSZm&$;BNXLrKAmBDhwBzv*M+o~g2p7QqNXJhW0@BOL((dECnM42zwQ2&Ajq-;|+ zqD=i3kRMG-tr^lq)mYW*+30BtdGlB*#BP~>g(~i`bbY}*%t{x+{JInMQjSzMfiGG_ ztcJh5cnL|45Z6=<51(oGe@kvtlO%a}9a1!EKygg-F$Nz_P=9;sZom zH1jHsZ}D{X^;sm=$%DvycY&o65o<|OLhq}!@4wi*XZBE2S;Xr4gcr;k_<0jsqTyX*|{(L@3*9;)sNX*XCD%49m zT;TnR&VQR9y=UIP^TP0_S&dD26_ws>pu=v45vEapBKf4iup4l{6N!FQrb2Angu<8j zLXPWWJPlvQ-lTH$ChECtHVLU1OqlkmFs#bNZSDa`k!o04YA7T+7^(CFi%wqr3zb`K zrOtNKh~MufBE3lk%T1G<>jubvm8F{)h6?Qh!MFpWzqRfb<=$?k?dDA*&weuy#%>>c z`2~gCpWtc}5xc=>UkJnCTZa;hH=M!Ff8jP&3b)equ-W8tGb7>JrGe#+Nq+4`I!0%y z<8?JZAHHmXVM_qf+PU7N(&=ruR+~*Wo)QS-mkL?lgDky}m+#f_TCU7Ty(~e!+y_Lr zxa)n~DJ9Wp-5j#$xq!~UAMRJ16l-r7c3vH?Zu1vlR>q*iyaz<@W?MsLY_U1ZmXmuw zi$qB~0LViXQX7~9`}6oNZxD76hGCg}h{Ynz zm-W~!^ML=vh`@cldY8 z?MbHjm662xEFw1bC=7dMQf3oq%zRAP&yVv8N4>;i9i7XcV>RV3o>OViF3J_GBC$mV z!p*`U6&v#LWfh%!mNpKR+f-kQzUmzYy zt3uX~U5vrE0jc~F5M8&Euc(r|ho(u($;=lN*B#mLTO;#Cwr&e+e|!moH{D z0bva|@9(8yO+)bUp+qv}P}zY6WE5CVsP(cV_D0`Cck)~$>Y<%)C~ zJZLdF7iuC*d;#)e4fjLQ#zXMU&-r=tSD^O&ps^gE3#@K_H?64T9Hc21BFOzTLX@aN z1i2|#mZJ83RETHY3K%*bg1M7_fz{ha=|tsf1~pw@K^)E!w2L#4r43kC5L}_FhSiK& zsZKFm5DzI~HNO|>L^<{_9eryxNzX-9PAvqa8CX_fRt{1d?ECDMXnlRrlq`HoSna7K zy{HU4LT5gVB92x=h&1^Oe6ban4Qr4WqZ+Z4POFg02K6aV39FmuZz0Ovvgm>#&R%GGS4F~EpU!m zkEd_~Sjzd`FIT~rmj+a(;RjZ8XksH#DLh84sy32&qxuo$*K^2zOI%2NjoJ@ZA?E^C zBOL7^m{ovi`+Hl7O#2)f^tnuS_;XmUN{~uBfw_`^dWnJi=lRRWS7VzEFrUQ$(e-p{ zBFbBHY5kUMq<-K)!WH}gNGm*2OMu3EH!2iLG%b(wHkS%mL^Aud$G%tB~LmW=Lx^SRX+2ZC5rEm41aZ zu5>4=vKOwP%OK0PcnBGf+TW}CGP-398muo~>-bVuQ}i!ek$Y7{9ezzF^;g`G{c_ZP zdnh-an9uv>VbNz!3|J&Qve{=SWwoB~T8K)|voy?WKQ2|=iDGvR-O^D|tm7cd-6|xc 
zA_haerP^UiSzVN1Cn}@Q(R)Ql$eccIgnLkl>~}z3%(C32w!YwDYXA`dXJDAV;!Z13 zd3v6@iATuJ!>&YXeFLfNC@}78P%q!95IjS|{%=HseP>WmFOBR)`S%O-#EkDrWKb`{ zvDj&+_I$EYmk`-_^?SoE0;+u{u-(QfLlx0^Hxg zPhPPOEI%2{k3282`r*C1ipsbuT0FLjeAyVjyr_f5x(c#aEY{I@m8IkTbr8^NL&xZL z{AE^iJf)i`t@@c38}5?D)yT`{r(o%Z%avHP@pz;%mOtSY2Z$ForF_F>R@?V!cai&W zn{Hfmhv)%eI$_($>MlrQVj-A~D&)-GIJLvrW%K}xF0Glf$YtH5!5tou2a&xAcla5U z+*6QCV$sL8BgirQJBN5Ec?{N`Y_RC{4z8kHQ%$>#dIG_qD!-~nA9Kbd=y-IPRMg07 zzV2UUNPoav^H*4nTf8gU#c#Ce%_CAi+=XxlFx0DiA^UM~KN}HS!6%l-Vmf4Mzf-Ho2I5)mMx=C1wWL0R z)IS!!98+1i=j+uC)-L0Lp^ViWj&c_z?KA2)<`KD=&5PP(#jgQ* zs6I$7W3`$?V5xmZl`X%KThl#>BAOu{f)xHWR)xuG_T6@yFC=X-EcSp!H{9Jrl>c~6 zgLmI2>%Z$ym`!>ZhF^0OZp5tY4a1Uny=4M|T{4UnHO>&jO^#@FS~<7|V%LdIMdYev2dw^2RlkrKtFGNaYq} z|2FFV82@g60t7P+*MpA=e#FlINk37ldrj-c-6f8)k;tF0BE$5-B(Nl@Ecd?OfNh+b9}E?gvl`!5o}%>QA9Quf zT@q)FMR6|+mH}w6J0O^i*unDndD@LIY@Y!U%r9rPV2=X9-6ZSx3lIn{;yGiZI_=!W|RaUcX znYSoi(}|IMJ&Ei(hR7^tr!*LSELCMO=fy8@6Ifb<<@z=D>o8c&!l7`Is_m8EhIkK)~fhxKMiElO;Qp@P+Y>ne&;jFs5uuq7Rk z6pV$WIkNAIc5wi$ZxU)hnNOI!1sZE%w5rT1tYGz}eSJmdu9axNx(Q9aH-RXFS|G^2 z0=M}f#J>W*?BFwwZvjiG5Awuc!`#0-6n)HEjE!qbvpY>9+$%d6=7%63#NgWi^(LaJ zZdJ372M~s9toBCiFzhf*#TTD8rAIv{5{|Y8WSAgzKY)cqRw34tx1x_NLny`rqD$X4 zOjN9FM7Ie|s1q}hFmGBzV?(j_96*qFsE`L2x5Aen3d~2-Ypniw3x81xZze99$kM7L zusm-gKHKz(z-1l44)d+rVd{G(VpaH5?Ut3S=9|NSoV6A6hgwj}BSAQeIf&7ZhNCGR zge-R=VjFmyuM)8*Z&DZdN>=-Dg%ITj2XW09JksP3V1?VJLZ((F zVpVWgw~IF)v^&f@= ziqeP{V%af0F7KujWl38xXBi&Vx*miq&8u}1?qF9RMYHZHwXNI)ZxtH>L@zz zn1DjgRJ)~F$>rYL0cm8+@Vx;Rozyo#R4&XZ?Ip#eX4YrU^ z*u5Qj8CGb%Sh^1=*q4tYrLXN@LWEN#$O7+dB%^XGpy|N8fH|Mk2EuXA28TJxX&{r7*p&8)v^VWGZmGO7Q}YSv%<{J-D# zT@8F!&#ogUFX%eJ!*gI)5AUG^y_mOe-u*BCMNXf*;NO7%_;1AG$qN=u|C`S&m_B*V zjA`Nj5v=~v{DG18z zkJVjV>Q3cXKas}NO@jZf=Yo>(h$t1m8JSf!5Vl1Sw*9N%051|!GW2u?AsDx~bdKseUYm*cv_if&@KCzZW#5XNJhux{)_L4Ng|FifLSspAJJiKqwbC1pE0-$9txLd*kC9%nxOc-&C% z&B9ksapALAU1^Y!Dus_o_oz%^_ji_r*MWmh9lwq2{@&%Qqd8$2ve=(`x?|Fo2?#XwbPn?M7icVnsld7;No>e-qnUS z^?3UjTKKN$|O z8vW@{s8V4?6SrIu&M@z=2X#R8-#{+9p^>9_r6QGI{hblLzsF!E%pq1AQXC4ou%^Qj 
zuL!P7-w8}+M_RMzEy&*;+O-(5B=MU^n$SDl%yGerE|E;7OpXn$9`cJ2AA*e4ccC2j z2dwn^mz4y5*n1PKOhvz#U`6j;@)=dq?Pw>%4WWp&BC>NgIQAY^T;P8^>av!98z-pb z@xOK4kj-k&ESZP8w5G$Ho(Ru+HYZX>cbeZtV^S(y@y%Pou}Hq?y@2X}1A+a%D4W&3 zHO{9@pElGx@1amsV@tT^J&+DS0{fy9x53InJ|vN#E|z%RozG@p721p zn`Tdx&b_E4=}a8w3c1^-Lf#FefT!}Oo@e@Iv-;8J7C@BR(v`~_g!8xB5>D3#gwdI# zI#;ka4N0EE&nl!)n9itv0)Lp*)I3;3l`-w;U8YXRaP3N@*uJo$hn0Tl#C58b!hs^B zuOpINXE@Ai@4j3_<;flBr~%i7m|91oG`PZwg-Lc80K!D55b>x8_zVbaKi|WwZt%k; zRCaNo-qE)OQ&?9bZ+1ucmL@sH3xwaOLUw-W2aZkWeXPg9iazGzGRk%8NK0qk6JFcG zu`8H2k`*#$mb+9n$=5FU!O9K{iy^)_tj6o{7Z6xS>aeIm@VwfMaCa~XQkucaP!MLX z8ot5HU!66HC#wxPsEgBTstoK($LN{_|2SvDTpb8Va}%@57az+3lf(GJJb$n{pLYpK zEXrYZ-L-2dGo(8$w`>$bdUqwtKre8t1#YlRP|FMz;uA0eso0E+^@0`s_$`3E>`seT zJ{B_jIbnhgLin~OE>{G{P9T;EeB#Lwkd94|z5|ADS&fnwO{EJx={;A5j2+d5NJlXV zxK@DpgYYL+$Y7TMWUm^cWMBL(tIgKMQ000r+8Fpw@cE$=k-i)PNE<*#fQlDYNYJ4G zWb81wwgy&ombEdIJJ6e2x7U(k_W&6+9FVpkOaMZ@3I8QNXygY-_(3TDB|!A;hsID@ z<3daOHVJY;dm?4~AY<)8m{Ax>&s50xYaf7P6@0eeF2fP*FnU;N4qh#36sq&>Axgeb z%Z_kt6hgiVNGzWz1q!F0pHt_{5mp;y0;H-h-R<>ASZHTQn3qOa>4YQ)fh5(4C4~=a zF%njyVa1t0!s=Qth^2C{8?_32Dr|hzf=EdO5J$u^2L7Lhl??uP`AF!%9zMTI3>?!t zSRj^buJk2$S4ivIg2=xMfH2E=3WP|IS3pT@_`{s5?)V$qdvRr06(^f4?t8vSp*E=zBuDf7K-PANgF6ewhR-V?fId_~slybjHw$;}Cup8hNts zF;*M*J_(DWAH8MWDAbRADkx)S0W#hsU)zX}9YHL%#`dqqA^b;>zGub9SY6tN6e=YJ z(BlpoqMZ$nxy(Sne1htK2^~9!>RTDbr17XrmAdmEgN`M9MY$7!v}cB%*tC8vFso)F zmPu%_O<->(s;@VW?l1w3e2GVA7U#2?QEk70CFFAcGbrqyX3hy!^caCfjeFw;=VZTMIw!hgy)Sxp4_&zlcc zu%g>)w+q8!3>{zJf!J(m61e6Oh-C))We2LiOSSTI`b0IZraq~F=-&<74J+elA?hN(%bk-U|ueOW8tu} z6TQC`8GFokI{XRx#Rg1{DJfvJHk0>JdFCh7vROaUH2x1kDO`+(kH02+DW3dFJ;kbRK8vnt0NmwyV1-Zm4a_&BTac>qYCDb#e+i+J|= zL*R6)kg=7Z<$lP;4+y`UKk|ne@dCNg&i)G}%{!e0v=I|Nax zQPuL)_E0$4#po3CLCJAeCr!ykBhR3XA^xP$MoX0SYY=`cS}aE$d}d7!qm$rdM`O(% z1`vH&>38Utnbe^nkk|~+VIr@El{oA$x!{-`+NzL`_<0hNtT#gW4JTMla?7KX)6K>L zA4uXR>j?Kg8df%nuh8RHR~Av zJz_a@0#j@rtvVx;jrUp*#RGc)cLWMkjeeP=LR@D|NBGT<eH+59HA4y^NhGLS?{;-3r6=1E5kwHxSE*uwr9$+B*ZWIHO;B8H-qLvQ<9iqQY@W z@LH>8>eOKfB 
z(KF##aB)Mhjj@>3xRey&WD!ZN{hY~-V~#|=^(A`$3?NNvmau^vAD7L9W3zbkMqbQn zpEZ1slf?>pvBra>+GFQ$NCcz|wEP21ZVr+-8j_mx7`9rIT~4m`98H)SC}MG zMj3?nJ*-p_I)LixwHH=Pxdui@LM|d$xQr%owNj0DN0MVAtx{Tvn0$qD`Hi>igB=X)}uu_AJwIS)@ch%kxTx#Akd#j}^ zWp(`vYp`7;(lDQyB&N+^K+<648kE01Y3lPUW@R1U*m*uO=7W=X46NwKPr63AvP4?; z$pVt48BDkjF*_@2(J!5Ff%hwv|0X|X&-{PYh2c-K8rvncRC>LM23(IIjMX3_`KH6M zYY4w9iFr_|LhL$(BbI~`j_YqciCD&6r*g~|>a}7H39BAPn9dn+tk%SB?F~q&>R4t* zI4C*{rSyjtoxJoHD!1KEUF@e5|KCkSdYu6)*G+PPJ0Sa2D?P;sbZB=F#sd)jjSV*_ z_hvioG=DmI@|%G$_WKaaFKFaJ1Xr8L*fl=yTm%N+2DDg$;S_fMb2q6{vYlo{%pn(B z8VT1a6IO1SBLvTL)FXX$wS%^}8ik-hTe;Q|G@*Cx*rqW1< z2(omEktmlAA(r1vxI@9caz2uLly8Vwgo(BmgsC)~#t!565I4PF)B2_*B!7j0NDB_b z%0pCNz+LP`IF`eQSuDnU*@)dz4=cLNRgF|Ry_4E*Tt>26`Vyuh2T6Wp;;IDn%RLqH z*|&?;bD8-ldYaWY9{!zjd(vo8O%!oCjf_n@0>_@1lsN<(GanQ7^W(jjpkLy!j?UxH zu$ro8PpLF?7v+lAkhoF<;bvnHa!puBOt^}+hQegQ}FCdSkRUsS4Eydv5gi?M6h_2^}7gR~xLo=n7WY#l^>yA9c@*0DW zAY(64%Kd!rR?F1m8+J=T^jqmmD$h-)JIg*N`X*Qzifw~=k8K<^s(6OF9OUydmw_;s zv5&1boMkofKfk8Z@x3&PtRzVf{D_haS+8U8RAB@$ucBrvV@E}zF7b#Zgg?t_%dfwo zl9)lQ$E_rH`}h;?BDRfHT0x2pKrBrvl~`+I3bi!gkA1d&YnRCZti84WAP)qY7${~VBHgZUz_ z3_E|9*HoUKL2Vy~lKwOROfEnvtpp})9Le_k7O`C8y;gjIVez5LEw4 zDmWGm$L{dugH}Sv0`YpG;T)@JYMR;MbSXI#ZX!%VG3sK2@WUbFVff~!{QL#0(EI-2SibK$R=1$1 zR#fs2()4qYb>tgOMT9I6WJ zyS&wqz5x&=3*T~9`+bUDREB&@XWfq`j@HA8H02axu@jh0>rfY?nz8h*Yf#FU>QkO_ zRyRM;LX>;v(uJ3z$vWGiL{?5g`dZ9!Qj5 z&!GBka3S#}dOuW!oC#WsbaVz`)&Qa%*L&9l(!eqrfoY&(~u#AEB*nHws@qL2##%6HF;gHXb{E?uNnRa zoc9mfh|;VR)IMV;X>{WV6Rsd*?F6ME5zqR}hxc55cTBWOOY>un^Q@+oMGH~deS+RH zrIDX64?JtnfOG$sSV*!|Na%4h zq#Xp-7Z82N)h$J3UQLMEX1_o}gsX%h>9^}}l&U%_fhKerRP z7p2tU*EG^}$sN_NLhpA5a}$X9yl);B{pZHQiiAft`wSJV*6VF+QR#D+c5d!1Z@)xd$1`;M1(vgD`1$Vzq=9F@D zM4s$Un1k10rK=!`F_4QiIFrT~F8UK<-e470mouP~$apAp_uSJY?~)t7`3CCZ498+n zO7jDVRNnTX`Ld{=Dt!vhFH<{<%KD46a8nt%w-t+G;B92Ao1g^7Vj(d*mc%;;#R0Mt z5A2rl7g$a1&yJ#;Qc35wuOyBGdJ{R~9x~QV;MT^fR?PPo^%-%{vDdz*o^Jv~+j)c& zs$Wg(lP-|5^VncZusU$vp~|lPx?WSAMaAkW71vxQUZH&n zGX$$c9UvvBL`THOJC>iqe{&X_?_RSz3p>Y<&zXJ#o1b2N_R5DdYI#-tmBV 
zV^bG&T-|*{niN?TuFz#y-0@tmraZx`dfXRIXxjYU!mW(9F@^2jyz~r%5d-7mKuXk`0<@!3>bMzw+23`3@ z6Lic4kDwEvFd68PwS42}X2>AG+wqrJjeCL{8tF>70~qRceNp{*grA3u zt>Tla5-^d&@i66);SzoxRo`Efb~ez)ptr=*WdPv}?+}Zdz}$?((p!L7B6)w$jW{F6 z0&)=$-LjS*BKN3~CWpQxjd_SAk`bjkcQyP7pOdPU(a9S@eor|05ggOIck~n)o8PH* z)Jx)3=T4+7OtqB$f;1=&u^d&ca8Ea?8?1fyJwqj{IUMaFO4=vXaqI(fKE;zTyEURD zc?e2KyjmAkWW_H5xvxG*tz@;DL$K2Dgeu#9BR6Jv5k)jZJOydVmsl01sMUAhZN8AS z!?4%`E4q;$o}&E6QyRMaF4_3aAi`|X!!i7TtYkB0Wj{EU!s{&)k>onxY`<1~C97Y1 z305w>pf=J~GV$^d!dU81T2K5IZ5xKgFL*=?QTj9ynmikjQp1m|CUP7nN?9-I(ySXK zai|Zjp)5rukV7fAq53z`???H!`x8N!>9`(zQ2Zly{tpI6@B0^FHwnpOSSuMlRJO-5vdnuXUZT!o{$17DXNt_-)+J+PR$R7 zimOGaTDzKn9DdSZ&p;!6Mh5 z5yv*&!3q~lm?O=QF+kp?Vel=6V`q8CiJOtJc=N9a0MT9jVyGzH)QG0x4MeO9Ci2(K z!LcEjFT0^I;Rf@f7+J6xb?JriS`Eka)OEPXBx^<6SI>yoypM=H(*{W%DsaLca7% zYkWl|K`;78GSnqx9AW0zin0TKo3Lpg=8Fu+a{26iTQJ3D;I^kDtmwM=@*;QGLOjx4 zOVc}iLgYypgcW$a*)I8|>-e+bdYnJJ%brDY4@GAN91SG@i(xwgSX} z-^FBM=eGfq5AxSSw}LPh<`vGTn$;}2Du~Q*E3rJCrM4p{;OU#$iV=?p_oMgS@Xh=A zx%;+ahl$1zKVDqTYI|f8QI2mWelggBR<4;qm~9xhxhB<0%^n`jy9W>J&5#DP*fv8o ztNYqb6s1^evHxL9Iyfa13rQPT2;IwKiJUC_`H#$$kR2^&rT<3bCZ}*~hlQN`)`##9zVOzc>OqW+TSMx1f1lCll_4 zJsk5#k`H3=Z326fAgbHd>f-@~;R>t078igWriJ+IqZagt*CfKxc7Oy3Qm+G8NMsda zGi5t;Yz0y=0T5l*jsQ`ywiVqcHmAmaB0k zS6R)HT3Bh{8Gh19QOWWIC2ORxeQEB<;&}K;3zr}wA@vEmDSpP9gID> zwYW9t50nmP7B@%HC3PYiIY(7XGn1=)k^!+YX8T=(6`eF7NK`J^iSAck5^`rL;R+nY zq&wp=U-Hd+G6c4SPu!i1x&)M%FIKMNK6%_okvr5%-S<^|3cN>Erg`EN9 zxsUlVNA0U*^HH=kB+=G*jn$+-A0aBgwiGX3JV`=ZM-thglUR0Oj37IdnVF0qO=*mJ zF~vM94LFMxUt_iUq!A*st)+OR?NR(vZ6#rrIEr@5#-jQc5Ze&^{;iF%F*OC!Hx6|P xEV;(&K79pRrrC*w6$PYn%^F1gh5S}Ci5|B_78j2Jvq)2J7Y|904le_eJ26HB3d!_YVxI5b*A>lS9;stmS zBo{nD9w0@U^eLzyi8J5cIxP4HnTnNWc6N8>o7tJ!eY?6s>-3n4G5Y%Xa7DyAe;AOe zsG}%V)pDgaUULd=y88CX>d|;S#ujWa&dzt9SOGYmy(;h5Yc=|Ldvkr;>KXICVWK7* zJ|;I=BgcMIs8XTlE9m%5(mgSpvj^lWIPk_m=bc7ZXPJ)MnXk|?Nt72cAc)0d_f@If zb}GC3o?NNkFXrjsE)CN(#FkGx`m4gP%g#Y?_nUJn9`9XDdoOlEWmB$P+EhPEb<{Lm z!f}AohhU%}ctCUkMxbG^PBa3e%a24~f`^ucaguj9T>!f*;8bjCAP!h%NtcVWOhvMG 
z$tckM<|{!ySxZCP+DPjN#Q2o!Jxbq%J(olz#oC73QtAyo9b%&$K(T$6QbAO z5A6(#ij|6P0sd2=XF|_~c7$FC?FqdU0tjw~Mnmn;R?<*}Fky>;fSJTh`5>XZlQL5) z>7~@teWpymqm26;6YF*-r&vQ#mUS`u;~mWm7*4l^Fd$C?_5+&^03MJM{~^kYpXuSh GNy#01&~dKaYE>#sC#7mmWNJ*O zYpG;tl9o%*bT5^FP4L5ZNHVwcIcEkO@6rhj<;I-wl~W`f-&Rk?(#SEYiv9iz;!PW z9l{VWcI!k3F!c2W(Hr+-X>#|#y@U`pxqIlYf`Co#l6RSBE(~%vJnn1bZ8SHy!h6j# z{nUMVR3Gl&w!T3I+8kQj18!@`XfiOM3H=}<+U7>Z_rgCGopx>R|U1PdC_^nq07k;j4 z>Q|hpdbQS2)uZD^gZ@q)|4rI`Yp^S?%M~o^H4za6p!FxBF$4i4y-suwK@T@X zfAnWBq9o0`oGox$*5O=iYA749a%qpJYfi!Skwd<$OZg=5quMT7wids2o3~?0vr-2L!p6> zvXX)#gb7;&1fOZbln+zNyBRaJ(|$&6GhlM&k{k~>AvT?EPKg0g)MGJP`i_KyO}`2- W!hplT<^#YExRRw$SvVi&N}d1$bO#*( literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata5_113.dta b/pandas/io/tests/data/stata5_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..3615928d558388f77e427bd7444798eeb1654701 GIT binary patch literal 4628 zcmeHJJxmlq6n-lvl4z_f)R-`YNHj5*-Qy6C5QRey{6SH`A1rk5cJFrK{Mt!f&1WyM!s!U4R=z8+;ba2$t*=p1VGA9ieH9hxU+ zK9c>lzH0b6`)kgb`!0nBv3?$8g+Z75mpMn0Hi3 z0{0)818yiwTU!kQ=#i*lcK5{MiDYW&Jh7+z!u)@^2ocp#1IWDOT+o(q5iU9|!rG)w zbdjmZI&)R{X8Ax9V4d%+IFw2=EQM}}%4@=QiyLrtq;tf#F# z$Vwt?s2G`PM1a`PQCuA`<9%@y(-To_pp>4Dn;7dF3Js!mtYX!J{_&*V$h198YowHO zA(0yScsp3mOJTRTn7RK*S`yT$_CKx#I?A~V|(tQD8?jgoP8Z*7z8e_i5`=r~&XUck5UV%saKf~8Y7@iI0d<4GkhnrvQW7lIB zPyGhlvB{!rc+a`mPPiavMZiyF2 zj6aE7{7F3cm*_=L#z;JCLc-Al2NHp+L`_5l)|uHESXx>%CPpZev~S*f-+Mdr?Rz`B zN(l!_!8Txyg_px%4*fi@hKV6*FI)0q1NNtXT;t{fV=1ezv(jn9zR~Az#?4F!3whWg zDA4WVz0l=5;H?Hf=NA@@#cy1oUz08X7tf$MG7{I2zW^Ybh=&Vi2t^YD%8`GFiBuqp zR*`pz0$QXd=Jf+rQzCd3c?SjL-zESRB`c#HKBp^8O}{m_fE*%Nj&*yz>QWvRusHyh zfR&fdr~QL~0wk}xDu-ovs{*o(4&O2X*}hHvj~UPf6v7n>GYkW}w}0)Z#kh&tAwQOn z?bw<-HWTJ@0ya5Zt855qe^EP4W^o=ICB-=iK5W4mIjXZ8FhO?f#^Q-& zb{yMNZey-rE&`}g@Bkq5j&gw|;v!gdTm;2Q9o0p~K4UC;v==pHSKb<+Rct_9h{aYH z;sBSKGz=YaanFHja#h+ep1P!<#+s7Z>t52^{M$$|zW3PeCnmS9XsM`EcM zj7ssaCc~tp#3Gs|i7?O)+cZHFobW$S>JClo)3`@UDHj~-k&ky8%V{g@Wd}3WpKw_Q 
zYHy~}Jf8tb&ijdFc2}H8b|~MpjgWI|>-23R>+jk7FrYRqllPHZpRbemgX9h~?qT2& z(OKSS-6lR$)vTg$i_*aunmjpK6!O=lr(n_-fk3J6toQ`*e)9|AWA_fhK>a5z^}#MOZL(aAv%K8Ye1A- zBMO^(8sYvm0Le@`u_i-2nUOJvcTq9}(Ij4lXHjIlNThi70}~lKS%qg|0Pk%AP{WmE zB-J6mnD8WArDIbJz$&kA(}7^nTty5|S9Qo--wd8irPkq_+E%<}OU6ETz867k+a|u9 z-@EC#Fq5AM6Z}*a*dlbq+S1U`q2TKsHU+Q(;~#`M61yWmb z6aY1z(DdHEWGbD>t{lhrRGgR_SBohAok#&7_m*ivYOF=9vlfnvT-AcTREu6U zRE)l{L5I?UwNOeeE|f7LH*4D#*5an4#b*m#FV>>pFwwx^(D29&mMt`vR=d15ZjPub z0CQLik{8Y{XDqPiVWcVCe7q&n+IHgPsnhLe&UTzT-`UlD;o_ysJ)2jHIGD4Fw0M1` zRSMJw>kic)4vC;-w2;&}77wntSa=-i(sUC#|6iacn-D*^VN8<^IfhijJP6GUM4F7C zZa@q~nhI0$u$~%7!K9i_APr_!Bc&sxDlj?%JCTfJKm5;01Mzulp5%y+X(8Yo`Sc)C z$x`6}znGc*Tv*{?{x(+gH$ZkVOg;I%X&ik_0!RIRaC@;rDHqZE-35utkt`EmI{B-#FKK8$G@WgJiYzr^S z0w0+c`>+=H?5JTprKCj<&JlKAu-!XaNd7=Aag>r4m4CJ1=LV6$lEhv)ErseuVz~;B NEtiWnH$#s2{{X{R*%$x- literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata6.csv b/pandas/io/tests/data/stata6.csv new file mode 100644 index 0000000000000..27a1dc64f530b --- /dev/null +++ b/pandas/io/tests/data/stata6.csv @@ -0,0 +1,6 @@ +byte_,int_,long_,float_,double_,date_td,string_,string_1 +0,0,0,0,0,1960-01-01,"a","a" +1,1,1,1,1,3014-12-31,"ab","b" +-1,-1,-1,-1,-1,2014-12-31,"abc","c" +100,32740,-2147483647,-1.7010000002777e+38,-2.000000000000e+307,1970-01-01,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. 
This string has 244 characters, so that ir is the maximum length permitted","d" +-127,-32767,2147483620,1.7010000002777e+38,8.000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e" diff --git a/pandas/io/tests/data/stata6.dta b/pandas/io/tests/data/stata6.dta new file mode 100644 index 0000000000000000000000000000000000000000..d8a46ea3b9696ffa4939b0bf1fb697f9b8b5833d GIT binary patch literal 3048 zcmXSBVr1Z8U}b=S<RO)OF{GB7kzFtD^TF=zPw=kLG&|GqFLRhFd2GZ+|xv|Q=SX zGe}1j!p|EP8i1ipnx~CSOp5V`3#zQDWie?splTn%yst~a85}hP>Jx!{P&t4T{Qv(S zB**tT6kIMm_|XY2SC%R>j2Z|Dw9zocl6Y_@8!U=pLgXh#cSz#2I}1*n9SoOTthN4ixnX45rvGzVg+CY zs*s$KSd^Gtl3G-(qfnf$P?C{YqL5jn0906#k*biJSdp1qnyZkLnwMUZp-_-ol$%*n zlA5BBRH+bLl30?cr!dG3O=)Coc=G>$JuoclAYpN*o*^N{6__qkkn>1kQE^FWS$Rce R6*QB8(-OAIYv|^YQ~=?gqTK)h literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata6_113.dta b/pandas/io/tests/data/stata6_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..2e4795b167f266cf86afd73033bc1a43ae9afc5e GIT binary patch literal 2752 zcmXS9Vr1Z8U}b=S1A8YsF#?%ZsWs&c3Wf^4iA4%V28Jez%uoeT>cf$VV08@t|NjRG zf%JgDf~Bh5UB>f5omR2Sf48Q;U{rCUh7sjN@lGJzx14FPW2~Zx7f}H%k zbi5{{<>V&<&A<&(@=KF)fUdv^QWAlVEJ?vB&rn=alnHVLP5@Pa+nWpwYD}oQRW0=l z(xE~mvr`xldW}p>is9xIGr-uWwvAxk*QG32e4hvugQaCG(hRWl&i6Ug0i^E1k4|tt zSgOo0Y9KVgM#B&@PP?<<#M!}c3Y^v%p?p{(egNe&pG*g< z{{)kVx&S8M@WTo2CcK#gs%r1!La=!pbMwK~28@Okb&W{o!3~Bo5*Zm8Aa*p^Lxeuq zJE-mPNK9g2NKH%6$jr*l$<50zK-UNJ6J`hx-LOakrWz1nU})G64U7E@ALh6vCMA!! 
zut-jMQV&cQ4ay(>|1&OGVj<5ELH$kdkV=JiA9OYC8KPJJT!HB#1v!rt78RG2mX%jjRzY(MI4vSl}3&VdPD7`tI;j1s#T!}4{8B2iT%}=H?lmoHk zWTVysf7Y_?{ERX=Kz$eaw=e_%_%szPpJ zMP_bku0l>~UV2G}LP2U#Ze~eIYKlTqr9yB?Vo9Q&!XP&^rIE4W$^ZZLz_6%;gvFhD ihJ+MXV7f>_&Lf3I#U-U>Zp6cc4dZ z9C!c_XOQ{^Jwz%G5S1T++QzKyH8vFvqNh@r%lwaLKh5r#bt%vXy%15qu9Zuw*$z)W|04R8&iJkx7(@COOUY1LLk4bdgCiAp4jA!y&v$g`wg| zPEXXkrT-#9pnShWj}{8-Zfuf>p(4BA7n0ORE}d=n+=2iRw%~opB;WS~-@wUy_>MF$ zHm8F4?Ll}dC_#I|Py&%+*<+Im`cwRSAZMcC9RC^)X9JGHSy!YEvYV#VyASm+{zvSW zjl_QMB3XtVFNe(A|Gpa5>-*w#2d0?EE1{mMCsVE&Q51;2;Y~!w*2LuS-Yii8v?bjz zm)~1<*;%15E_;s5L2eL^ri&tw8yebV&9o~vT3T7>2CitXZE~(r zRI5102`3}?j~AtN#XI)>n!w_7gvGZSEYfK)7c_lGR$ay`A3m;C*X1P%XX&=(^>FWq Fb_Muvm<#{_ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 0b8cb2be2371d..309a0a7b7e256 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -29,9 +29,18 @@ def setUp(self): self.dirpath = tm.get_data_path() self.dta1 = os.path.join(self.dirpath, 'stata1.dta') self.dta2 = os.path.join(self.dirpath, 'stata2.dta') + self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta') + self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta') + self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta') self.dta3 = os.path.join(self.dirpath, 'stata3.dta') + self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta') + self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta') + self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') self.csv3 = os.path.join(self.dirpath, 'stata3.csv') self.dta4 = os.path.join(self.dirpath, 'stata4.dta') + self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') + self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') + self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') self.dta7 = os.path.join(self.dirpath, 'cancer.dta') self.csv7 = os.path.join(self.dirpath, 'cancer.csv') self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta') @@ -45,7 +54,13 @@ def setUp(self): 
self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') self.csv14 = os.path.join(self.dirpath, 'stata5.csv') self.dta14 = os.path.join(self.dirpath, 'stata5.dta') - self.dta14_10 = os.path.join(self.dirpath, 'stata5_v10.dta') + self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') + self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta') + self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta') + self.csv15 = os.path.join(self.dirpath, 'stata6.csv') + self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta') + self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta') + self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) @@ -112,20 +127,30 @@ def test_read_dta2(self): 'monthly_date', 'quarterly_date', 'half_yearly_date', 'yearly_date'] ) + expected['yearly_date'] = expected['yearly_date'].astype('O') with warnings.catch_warnings(record=True) as w: parsed = self.read_dta(self.dta2) parsed_13 = self.read_dta(self.dta2_13) + # parsed_113 = self.read_dta(self.dta2_113) + parsed_114 = self.read_dta(self.dta2_114) # Redundant + parsed_115 = self.read_dta(self.dta2_115) np.testing.assert_equal( len(w), 1) # should get a warning for that format. 
# buggy test because of the NaT comparison on certain platforms - # - #tm.assert_frame_equal(parsed, expected) - #tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed, expected) + # Format 113 test fails since it does not support tc and tC formats + #tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_13, expected) def test_read_dta3(self): parsed = self.read_dta(self.dta3) + parsed_113 = self.read_dta(self.dta3_113) + parsed_114 = self.read_dta(self.dta3_114) + parsed_115 = self.read_dta(self.dta3_115) parsed_13 = self.read_dta(self.dta3_13) # match stata here @@ -135,10 +160,16 @@ def test_read_dta3(self): expected['quarter'] = expected['quarter'].astype(np.int8) tm.assert_frame_equal(parsed, expected) + tm.assert_frame_equal(parsed, parsed_113) + tm.assert_frame_equal(parsed, parsed_114) + tm.assert_frame_equal(parsed, parsed_115) tm.assert_frame_equal(parsed_13, expected) def test_read_dta4(self): parsed = self.read_dta(self.dta4) + parsed_113 = self.read_dta(self.dta4_113) + parsed_114 = self.read_dta(self.dta4_114) + parsed_115 = self.read_dta(self.dta4_115) parsed_13 = self.read_dta(self.dta4_13) expected = DataFrame.from_records( [ @@ -157,6 +188,9 @@ def test_read_dta4(self): 'labeled_with_missings', 'float_labelled']) tm.assert_frame_equal(parsed, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) tm.assert_frame_equal(parsed_13, expected) def test_read_write_dta5(self): @@ -302,14 +336,39 @@ def test_read_write_dta13(self): def test_read_write_reread_dat14(self): parsed = self.read_dta(self.dta14) parsed.index.name = 'index' - parsed_10 = self.read_dta(self.dta14_10) - parsed_10.index.name = 'index' - tm.assert_frame_equal(parsed_10,parsed) + parsed_113 = self.read_dta(self.dta14_113) + parsed_113.index.name = 'index' + 
parsed_114 = self.read_dta(self.dta14_114) + parsed_114.index.name = 'index' + parsed_115 = self.read_dta(self.dta14_115) + parsed_115.index.name = 'index' + + tm.assert_frame_equal(parsed_113, parsed) + tm.assert_frame_equal(parsed_114, parsed) + tm.assert_frame_equal(parsed_115, parsed) + with tm.ensure_clean() as path: parsed.to_stata(path, {'date_td': 'tc'}, write_index=False) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed) + def test_read_write_reread_dta15(self): + expected = self.read_csv(self.csv15) + expected['byte_'] = expected['byte_'].astype(np.int8) + expected['int_'] = expected['int_'].astype(np.int16) + expected['long_'] = expected['long_'].astype(np.int32) + expected['float_'] = expected['float_'].astype(np.float32) + expected['double_'] = expected['double_'].astype(np.float64) + expected['date_td'] = expected['date_td'].apply(datetime.strptime, args=('%Y-%m-%d',)) + + parsed_113 = self.read_dta(self.dta15_113) + parsed_114 = self.read_dta(self.dta15_114) + parsed_115 = self.read_dta(self.dta15_115) + + tm.assert_frame_equal(expected, parsed_114) + tm.assert_frame_equal(parsed_113, parsed_114) + tm.assert_frame_equal(parsed_114, parsed_115) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 38ddd9136eae9090675e6ef2c4cbfd75270899aa Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Sun, 2 Mar 2014 08:39:20 +0000 Subject: [PATCH 51/64] Added expected result to test Renamed Stata data files to include file format --- .../tests/data/{stata1.dta => stata1_114.dta} | Bin .../data/{stata1_v13.dta => stata1_117.dta} | Bin pandas/io/tests/data/stata2.dta | Bin 1786 -> 0 bytes pandas/io/tests/data/stata2_114.dta | Bin 1786 -> 1786 bytes .../data/{stata2_v13.dta => stata2_117.dta} | Bin pandas/io/tests/data/stata3.dta | Bin 13255 -> 0 bytes pandas/io/tests/data/stata3_114.dta | Bin 13255 -> 13255 bytes .../data/{stata3_v13.dta => 
stata3_117.dta} | Bin pandas/io/tests/data/stata4.dta | Bin 1713 -> 0 bytes pandas/io/tests/data/stata4_114.dta | Bin 1713 -> 1713 bytes .../data/{stata4_v13.dta => stata4_117.dta} | Bin pandas/io/tests/data/stata5.csv | 12 +-- pandas/io/tests/data/stata5.dta | Bin 4924 -> 0 bytes pandas/io/tests/data/stata5_v10.dta | Bin 4924 -> 0 bytes pandas/io/tests/data/stata6.dta | Bin 3048 -> 0 bytes pandas/io/tests/test_stata.py | 89 ++++++++++-------- 16 files changed, 54 insertions(+), 47 deletions(-) rename pandas/io/tests/data/{stata1.dta => stata1_114.dta} (100%) rename pandas/io/tests/data/{stata1_v13.dta => stata1_117.dta} (100%) delete mode 100644 pandas/io/tests/data/stata2.dta rename pandas/io/tests/data/{stata2_v13.dta => stata2_117.dta} (100%) delete mode 100644 pandas/io/tests/data/stata3.dta rename pandas/io/tests/data/{stata3_v13.dta => stata3_117.dta} (100%) delete mode 100644 pandas/io/tests/data/stata4.dta rename pandas/io/tests/data/{stata4_v13.dta => stata4_117.dta} (100%) delete mode 100644 pandas/io/tests/data/stata5.dta delete mode 100644 pandas/io/tests/data/stata5_v10.dta delete mode 100644 pandas/io/tests/data/stata6.dta diff --git a/pandas/io/tests/data/stata1.dta b/pandas/io/tests/data/stata1_114.dta similarity index 100% rename from pandas/io/tests/data/stata1.dta rename to pandas/io/tests/data/stata1_114.dta diff --git a/pandas/io/tests/data/stata1_v13.dta b/pandas/io/tests/data/stata1_117.dta similarity index 100% rename from pandas/io/tests/data/stata1_v13.dta rename to pandas/io/tests/data/stata1_117.dta diff --git a/pandas/io/tests/data/stata2.dta b/pandas/io/tests/data/stata2.dta deleted file mode 100644 index c60cf480ad5dd82db28475872f08a5280e7a80ed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1786 zcmXS7Vr1Z8U}k`TuZEwM#2Zf-J2W&hG6IDfv<-ps3=9Vt6HsUdi{;E9@ed#O$N#Tq zVAxmxX)=h0=`ZV<+WFtmSiv{3NWsVe$S^dwGB#oO|NqaQKfizfNl7e8Ey>JHjZcPH zi$TH_BxR;!RRmIip$l1{JT)~tr!pS5y4?J{k_>z*3QH4Yn(E6GUANsF&cO~k1L 
zpB%DVk=Uvw$qWYRNCJeIswK{(YDi%q#R3M^l5$ctvBq-v-nO#`7|AC(^s zfzc44dkARUeA`~?#L%$t)f*t4<*vyU!DN+kiyh>U8KT(-LG-f8W(UQ-FaLYw|GRHY VAORqF0IZuJ^wW9}Oig`S4*($EbN>JU diff --git a/pandas/io/tests/data/stata2_114.dta b/pandas/io/tests/data/stata2_114.dta index df9c19ee0a9762757d775f5793760b67a2085ebb..c60cf480ad5dd82db28475872f08a5280e7a80ed 100644 GIT binary patch delta 219 zcmeyx`-``}h>4MbgMpa=0=^o4RuXSKVeHV*$jAs3YS14MbgMpa=0;GTx2%ofN{0$PxPU>Lz^0S%o0T8FfXERjrSFzRsF<8CU z|Nl9R3=HB78-OGO0}~M2#j}>l0!dX&VbUDn?Ew`WuAC}WAZvS9{@S@L)HKQ diff --git a/pandas/io/tests/data/stata2_v13.dta b/pandas/io/tests/data/stata2_117.dta similarity index 100% rename from pandas/io/tests/data/stata2_v13.dta rename to pandas/io/tests/data/stata2_117.dta diff --git a/pandas/io/tests/data/stata3.dta b/pandas/io/tests/data/stata3.dta deleted file mode 100644 index 265fbcc3a8187d355a598101472f23564415b4ab..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13255 zcmeHtX;c(fyLI&dg@}lX11ciU2q;*hqC!`9ol_t|1I9QK#HoxkI3sTq6;K3&BSM83 zR76BYB1Az6gA)=dhajTC#3`aiMB{)H(QkL<{nom7tv9*r{=V^#t=+r#Gwf$Lr+T4Q z!`L$CnLq!8cAV4h-!K2;UzdUo8O?7*-~RLOT(Ugi!zzXm-VEf;{~u->qQ1}9OfoV9 z26r7fC9>-P4^NM-1Gu3By_u$W@Bho6$Qe@>nn8d0Ct}Hzg^OqW&1V+Qm@;?f^zgs@ z0>=D-X$$5r`pZw4KQL?l(!YFP{f3zfmM;FwPnf@9dibore4m;2cW;_u^F03=tiH2& z>a4j7r!1cFmmga)f5yDOi2}ofFZkQH8D`deY3^Tsz4?tl1NGNo_**c;^c>R9W9I+x zng90o{~qA|zh(Fb@ZTc-pa0SG|6-K>b144#-+v|J%#`OP5W5w<%8~2I(m(~nD>>i zF7v4%H9jF6_Yja4LO~bGbKD+2>WZh3bh0g<5$C| zLX^tij7+o*SQdk2Kku0}KyV4^&pX8?0HP;t`cY|6E!li%yO3r1Ops^&PENf9gs~Aq zk~V-PkH0*X6C!*z_{Kh3&1bQi{_i{~6ZA8gePgc>>2P0Q!e5a3;CiE++8p+W@sgCu zpVtf&tZUlv^}#8IELQ8&%abadZ;>T!whJ6rCvc};5=p8za-5Ce6zi%&$`1^LVJ&?* zt~*$C6T>~J?0u6k9@~U9ho0o)P8jMn%jo@S5UNy{hotL0gy75u=ImKD5zSgBb zmEvxa^S4(E8wWQCT-8f5x}d=*JG6i=t>JzJA30)>;D1{-z#qeYR^vU;gGx(pk{T{n z2zvZjVC-I#+bhfvTfsi3Js{Wl7k>-_OJ#A6pC`Yc)!IjMRIa!~zHmwsnp~dZt#`yF z}rHVVq{56J!#u-G9l_9{!y zxFPteKto8CzHzl8)Nv{BD$_*23t5erm*uZ+Z(gd+hj{Jx_hJS?%})17$8f 
zChueR3f`vysnXM=lfOeSt%cY}o;<_k^2g(cLT?tna;giT&Fad6j8rLlLb^v~3A?|u zB+MoY>SO&JviuWb7EJEwVaO2+jk&@wy|%(gx$e*L?Yo80w-!X+Z%LhEpBt4xdjxr~ z$}&bb97YEiz{DS5HPeF$mCn8(0XGf^xyQAHImc4#8Z%2n+$d`oOw zV;3ggf%~IqM7uSEWY-T;kYNt2*gmTSwH2H3kz$NI2ysHgu?DHCs&O(%Zs9G<{cMVX2WVpSe2OzrD z_dkVTEa=*>GGT2rQkl?}o_hHjf^mZTQ^1nW$M537ve&>cc7W)ER*j`xl?9D@aZ*Uh zWQZiTrx8AH;Y(K-7Oq0tdmB-)I^O^GPliLRMt^z|RVuA$(w3{j8Rk8jQU_%J9oj`V z6mk@=RHpH(zcZrt_ZZBAImBv1NKg7~|mcP)W0$^7P# zCe%(hvtMA*C6Q^A$+e-?Lw*qwLJ+ZsT`0%>0hV6>W=Z6SeK3J#8tTOa7QJ`r=TynC zqn!*lg<{r<$j;qh*axtr|Gafg)O8M+CXfaG2HJd%c*-Q##O518xYhwT?uo zcLj@uNp=_j!9=JK@u&#+Of=YjzK2=e;Kxg;?BYPZW9|s1u&zYj><;%WO>(Li1iw** z?EKgd8k@oUSdRmXKK9~r%5~~U%Vysf-rB;jtC%;E6(VMCcWG*nuV3&3%MJ{SA-=h+ z#_Q=9Xt0jdVR60Sd954a?qU+8HUrC02xhP9zQN01oi#}(s|~rxi_>bV4D3qB=o$t8 zcxS>~8wf~q6BF%=kL5ziVSG`(KU7`7yM!ba=d!x)+BK9J(w$aVHV7fTyAoxf7c|zw z#4sjw%S;vG6EFgy*o=tv0*ik97C>Hir^TzD3R(S}Fu?}FeOnWkCqiQ<;L8L)>EsBs zj!kHN2Mph`8YMl3N*8+4`>qTbJE{wjj$#sUtpM?d;7_WM!7c%aUNxGMeaW}1Hb)mr zl^eZiL*RSC=Z8*2`f>;$Z2%boDPB||L5BhmvBS{X8nEarYhx*Qpf|N{uO-9o12Sqj zAZ;O-0JwY$_Dg)w$d3^4gXsL10MWM}8cSu33oYy0D9DBFiIn4mh_#1cMqwnqP$A>5 ze*}$H@;QFH3`elT=)uw)x?0*GR2SHzDfyyXc7$Q0;PN#<;`l5nP&oDSoH}2Qu-YIK zAXRRK<1qjIntwF-PLY<$v! zNXY~cNBA-Z_MZkzCV#wQB>KP}zMxAi4AVPUz?bW;^fh-+NblQ%$iEAKI3fGv@UgGJ zQpg7`8-@P7%dD~DJXYhJyB;H{Kdp(Rm#U8ZNDhpCyXaPt7Al+bC7~D@$A%pjS4DswlUQA%oB_wU2Qm+9tgJ;Qj z7Yo9j_y~q|N06tXMI=LGrTpmeL2!RKUf%+X-tpN6OffGSVWA_9RvL(ZBsA6oRVg9+ z%T!3&Z$T*Jq2_Wk_`|&bbJEtPddi&~4k$VN5jZYLj ze1eGC8)F{@mwv};H}{aKbd}Q0FZ&ShnAZZgXDT2bCZ;qRv$76(N#pY$d;<43n+@Z? 
zV|Dt`sgz3)Y3RgmEoQaK2Qt%iCT%>PtgjvNE=_!$ejYy^v5ItP|UNvOBedXxBUv4bZCr0=;gOeE z81n`Z+r!Te`P96P8?pKTqFsGCmC6qZU1#Y?vf8~A<5~pUhC0ew3z>S{E|gu;n#Sq4+p`bFF3M; zDpyQ2d23s;t{w7H8U~hN6Zd>Qa`6ictK!Etj75by8-1+(N{_Lcke{}r?g{PwRin^< ze4r)QYkrr9(T|X?Hp*#WhUxn60-j#`q(*S-^wT^ zk4Ijr)Sdqr`dH#ulsgefduHm1P3yM;6Fm#QOh$=qf_k%%eZ6sXhY2X;OFSyGq=40o zYWoeEQV>m_+M48_dnGVW=OQm*CV9eExcoH?Yvf(mO@Lok@WqoaV6|KQ(&5W!TAkj3 zIFMHYcNePzGaaeihL2^z{pWn6)kKK@y!l`S7Ts36T^JT)==g^nh|QKpfomQCUuL3S zb|Cw^RFy!oji=wZ9b+d&+k@JPGw;10~0n7P4BKDf_5A zYZA3=){iuf|3grUmLOt_kjiunyIt_*F5lnB+y?h!nk^|VWObpFv#GrLGx|PeFo{s! z3(DL`L~JR7ya$FIN04vu^X`YJ`wY%zhCU2wdTIIHn_1W2E$)O5>>c=q{2;B?W5*eb|!Kib6)aKC~-@`oAnOikr+ zR=Y9dTPnSmsO@JaGH(aEWz=f8zZQM$5Smhr+AU9Q4~3Ckj83s1l^$nx($qW@@=V$g z;!lcfv_x6I2JXk9#B$ZaXYR>ibTW+WXsp@80HQB1`wsOoi#pT?5}N@!Oysp-iN_9; z2aVaGtcv)EpC=>8dLuf&;RLHmX?c`#x;a?j14;Z89pOI2fMp}{Qh*X`r$U^=!!VMJ zsQtdBCvX-Uc9b%=W>cTpfh0+b`C=CXmMzd&5eA<k{tfpvB0lsw}4fGR82-}Rv#yF&M7a%1ls}OX{3VvtH zX%NgTyq-`}gnjI%e7OHPZOrl{{*g9B2{XID7w(^dFHK10L%z0f8p_QA9j2zZh}Auf z$)`;8Y`U&X50bmZn#kYBf^|Q7awWE-o^ZdBFPl9b_0rYcpM8s1eNOE$%B0Vu_7?_{ z(N@ig61^Uf9Hdf#VRPY2J%97bbYy?1xsUPR!75af17XOGVj zv0iZh2SYKdYaD%ya-X0Pz35AF#*<j_mx7`9r1qxY%`bdKIPyUDoELlV@Z|O;t>0bl#qe-bZ zL%ORPt9>&UJ#8^>9!sUzEwe9C#Uq|>ELnhA=}MSiccNaZkjm!pMT>|%NIF=qqw}{X zjeUN_tgPc3Ixj%Pd~gzv1&eNsPSPdeQZw3sz4)?o~*hiODh+T(p_>x%4as7=a;mf!i zRF2(3y;ja8Vb#M3(>W7{)tb1iy#Xmx4a>?5heU@VmHuGS$;*DBa@+0H#eN3y|J_8S zx0ztMVUi2o0okvz^b{jdq1_=E4?y%cH{7J$yX~~of*Iu5ZwA8H?}IPDppXX-J(kAcA6bAmt1UVBwVK~u-rDuZ@h_9Y@Rw^*YXSD%QhIc z91yL&+g&Q1+<|Mgxn%SA0%80!AxQ9EXR2r+B zN0vMn(E0bn{W_Ck;{(Ias^ist;Udh+ICPlzfau-r9#R=wY{AM^l8jW;_Aj_5-iE`->eEH3UI~3e27a+(-`TB^( zm}qMun9GLK*kSx0s^%9SDbUuHE)l|KBPNku{C|9zE#FrTeHwS}|Ys5lg z!d0|2I!q?NwO}d4U(VzD1`w_DxtElCzKc2xS&chOf|Gb2e0hWU0`W*%6|!;MG7P>= zNag2%=z5-bMV0hDG)r1VX1}1g?#PEPZ!!1?BK8`o+|T!JwOl>EVYdWCzm>kG^1KYX zv;1E~-w2kW*fy9C*v65g$`{DXK|VigIRtYB``Bv3Syq$q^IIw%-%F#&Dw6!jk0>c< z>vasCDvTiJRn*Kfc2p$tk^o;q__M6G;>J5FiJ8=T+$wUfk3ZoqV%vz;3Q}AEd}&l6 
z=bcxekA1C*|16$a)V`z4gG@SmOeCoegE3Fd+lGa}Jr5)yNx!MtPqB@H`!+c894|S` z>K|slp-kFddNXMjPOkn${uU!D&k`eO6qH;I@t@+8??%D><@Ht1XchWcAYLyroMSZ&_3tTj zWj}rPY$lnQ0Z98|M65X?HV4^Xj#Sp;nZ+vX%mG;7y#Ud=UuHzPI-7tvGw*bou)V{9@ z@oI>Mp%WmO+r{TtyRIcPu)3s>gaGIc9oPsQE!4geyg{~U*aO`Szis6EIcm=EZ zy-X*{@rUV{n`=pSA*ynEDIhJuvIetqsM=uP=dVWV8-S){;akCKzfaYR%8+mA><2N# z(Rw(Mrk;W?b^^0$9r9vSBbL#14O019each8>J|iAh;q+7y6932S!X+x$jT|SzE*gs zk^uL4HI+YaTZ2M2V(@kUzJk@aX~~N6_xUvI#{{y@au`u^&mu3aagN!Dr*HyTs`%Y6 z*T9#T22`fu2Uc@vvX!Wm9;Mc`o5_OF1BvqM8Dzf=E+oE0?T4z6GeK(+j?NIw8bGvz ze5^&La{&#RB$Kp24$D;qQfV(Rmy%E~ad7`EfAQE_Y_mb;vlt+{KF-ZWd3zyk+_r-> z4jDqYk{?FRq|3 zLzW%!5Hb<9zgP8TOq)0~SU|b^w_o|FK{F+W0FS#T8RjB>WP;MeIpZCqf zqW`=&ut<1hv(He;YQ36Ti%Or%2;Vt*Cg(os-s5+KXnDkQ8r4nw?++F>eL zU5sEaDr3&jyJg>!dHvl9cfSVN?}EITWw}djeW3yC01*IZV3@x8c3V+-dX~D2-;$k& z-H6ogI#TH*FdpkrFW;yTJVV0%Z$yJ_GAO7QtM;P&`#E}i)^{W-q%YxE>@;;wcsd#b zS?)o^GWm4t^$<)to>(pA6;?Y?bP(mT3OejVDUqkR6XxIzuyhq9F&6D29mb^dMT`H0 zm^WC3)#VQ8Br+Zf-97I#$-m@=Z@!7VIK!}5q|*EVB8|6wY`!e&r?x%?=a*@nMP>a( zTC}O0+~0~tG4Ku|)=f|X_%i3fJe`3tNj?`KC*PQ6U$wZBXp2lOU# z#C=4po4~D&Q(4US7Y{Sz(Z}BUo_e_n5N+oXPRM>WeVBZKl%K~2TZ+|z>yDlrhq?a^ z3`^t-Y&T%La70sb21M6unzN`_U8CZfE5s|bFJXpYb*KZR)Ewj_xW9>?8odE5KN-xA zJTI{Nk^Opz%J^DZKCYI0-VDCHXn@9g2(ouP*3m?j#p&J#2~64tUy2GCdv=%XwT74AQ)8T7mes+E_ei;hz^s98d=LX{L2g(1b93C z6031fbVIxNjh4NAM5;!*67B$odR<>+KLPINBVy5fa#bQGayTBQJTY9tZxtT)7p0x` zv>~X8Sh@@#oZ&ruaTAzZ@mP8b;Y%d%@3|3Y*3A0-xN|J}5ge0hWQJbvzB_I#f2dS4?t>zF| z>Yq_%+i&FNOfRB{W{9UCE&UR!!c;Z;?z_zwl6Dvtd%&U_>ES8Le>|t5yYG>W-wYzm z7Cj8ZuQ^ILV^;QqVX3^{G6_Mh^Ud*V#b0LiYcGN2(kp5sT_Y2(3?YoA4ypCT4@0(L zSp0%Vv}j77C7~zJ0i?|EBdduV$B9z*Yq~7^CP^CVgKH>DQ3>Rb%5BL0E!6u_zG;6F z1TzEIgO5sn#LoZGKv8OVLmS55A&#<<$RDvH!wkfg=XR`c*YSw9kWbca!i{1%eyh;Z z_eWM&;OQkY>F?2&;YXxin4PJE1bIR#SW;D%yWefXHcrhChDxefjo<QTp)@x;EnuNwC49xR(XX zU^Lh?2xc>Oup)keb~6myXFvptt5|K-?7%$MEhFyRLCq8M4Y8F}f2@mdYT z^wf2@$fRgR+cz(W*Zfb2Jj(__9x8Cc9%xKbeF_ zQ9Z^InZ@jshM|vTsx0Qb_y=zROFOW9S8|2biZ#BXlBgH`BN^%vGLA5FZAIAuzfIV* 
z5A#KaVR?MczAczyGjZF~5iGiHzP!jCwh)hW*V2p*lZZSSgRl~hH@jwo4xeqA0~#i~SE<(!r^rSV&qS`+jH_ z2hjSap!U=Gq^aAWvDQZGn!M6#R$n>5Ph{>`i|yAor1ldpEwjPA|qv1;$ zpL1**SSo#yC;lqt{>2gKV>V)ZLJOMTbqe8L*~2h@1ouC?giPm9aDIrycDgE;KN zr!eFYCUQBHoW@tw?Z8oVAY{3#u)0&Ma<@qD$ID6mqWGEzL@< z@<{>2%9!JK9V|L&K#-_huoK;{z9!`EGQt%)h{<=yW4;ua_hdBKQa)*S3i1+AYQ9*x zhWq4kBSr2|OR=}t0}^>TipZKyVuITk{3z@UATNE)k2z}Jq?nJQWg$tn#_Oym!w{AfyJ)QhR+S!uvotmHbY)hCY-nQbk_BW;i3 zmujmBv(!4MbkKr@}1jI>Yi%m9UlCi!Ju+|YI)bJ2OZ*Zyb`d`n$;QZ6X0VJNl z7Xy}exE}4uU}Ub~n^>t}WMF8ZU}U^8lZ$EcZ)P<{i_IUIRTw8LuzUsLq)n`jY@5Tl G3pD@&KOZ~* delta 283 zcmX?}emuRth>4MbkKr@}1n2@O5I$kdI1M7g$QTD?7+G?yf$$j^1ojy*Z`^Cd%Maqf5G)1;JZz8@WBTL=EIN}FSiWvn IU|FsY0BI0C9smFU diff --git a/pandas/io/tests/data/stata3_v13.dta b/pandas/io/tests/data/stata3_117.dta similarity index 100% rename from pandas/io/tests/data/stata3_v13.dta rename to pandas/io/tests/data/stata3_117.dta diff --git a/pandas/io/tests/data/stata4.dta b/pandas/io/tests/data/stata4.dta deleted file mode 100644 index c5d7de8b4229543187a83a73fe39a23be00306da..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1713 zcmd5+y>1gh5dIPfe+7!5p#W)x6p02azDOvf$z4N1!7Es27~3nY?;_vXh6=Z-sHovl z+#?#EAPr9eM9g=$C+^}ObOjPCeX~2e^S3j*lkM(c4|mWU-@bm?;m)JxHt@0BueO(7 zuHtX{#itK1f#8{WA$=1Zc8^T=_#`;OkIP>_F9%cS-t}DE7bkDvr^*25hCqN)t}bAc z*Xj7oNiH=*bpda@H@EqyH?~=3lVMgD7`Qaf%NQ8M_Hl6BIUH^mX`UWDZ>!J^+G+gz zkXVGC>pMt6n-?VaSAjoO+F-#KX2WUS#j5vmCo;ubF}~hr7Ef&4chbSDYhHpg5r)@+ z3lb2Ea1U6L;QM>vb9kd5QIh5(nQSx*L@f=j#AfE|Kung7r98__th`S~r-*J_gli1y zE3fo51#T)?2h~OkgT<%NpJL8Zo8+}*wlknbFn@P4FyZE?!xM%F~|10qlLTUi pqa2B)pt2{bwDuoe37AgrDLCo@*8^J(fF78VwO{#)hyNxezX9{t7M%b9 diff --git a/pandas/io/tests/data/stata4_114.dta b/pandas/io/tests/data/stata4_114.dta index e5d4e8ca02f045b05ad43700453d520745496fdd..c5d7de8b4229543187a83a73fe39a23be00306da 100644 GIT binary patch delta 287 zcmdnUyOFoPn2C{rm4S-^6{(Fcl&W6FvY{2jjCt8T0$S7#NJq6x>pi z6pRcEjTHmMk 
zRzGH-F_X(!j3)CjD^0Fs(Vx75S!8lCv)trP7I`4s2uw2q4Mbm4S-^0yKaW2%j)!oCXnLWQ+qcj4Zjzq7~Rag|e>@ikf95O(C0B_Y}xBvhE diff --git a/pandas/io/tests/data/stata4_v13.dta b/pandas/io/tests/data/stata4_117.dta similarity index 100% rename from pandas/io/tests/data/stata4_v13.dta rename to pandas/io/tests/data/stata4_117.dta diff --git a/pandas/io/tests/data/stata5.csv b/pandas/io/tests/data/stata5.csv index cc597582472e3..218380c99593c 100644 --- a/pandas/io/tests/data/stata5.csv +++ b/pandas/io/tests/data/stata5.csv @@ -2,12 +2,12 @@ byte_,int_,long_,float_,double_,date_td,string_,string_1 0,0,0,0,0,,"a","a" 1,1,1,1,1,,"ab","b" -1,-1,-1,-1,-1,,"abc","c" -100,32740,-2.15e+09,-1.70e+38,-2.0e+307,01jan1970,"abcdefghijklmnop","d" --127,-32767,2.15e+09,1.70e+38,8.0e+307,02jan1970,"abcdefghijklmnopqrstuvwxyz","e" -,0,,,,01jan2014,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" -0,,,,,01jan2114,"1234567890","1" -,,0,,,31dec2014,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" -.a,.a,.a,.a,.a,29feb2012,"!","A" +100,32740,-2147483647,-1.70100000027769e+38,-2.0000000000000e+307,1970-01-01,"abcdefghijklmnop","d" +-127,-32767,2147483620,1.70100000027769e+38,8.0000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e" +,0,,,,2014-01-01,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" +0,,,,,2114-01-01,"1234567890","1" +,,0,,,2014-12-31,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" +.a,.a,.a,.a,.a,2012-02-29,"!","A" .z,.z,.z,.z,.z,,"&","Z" ,,,0,,,"1.23","!" ,,,,0,,"10jan1970","." 
diff --git a/pandas/io/tests/data/stata5.dta b/pandas/io/tests/data/stata5.dta deleted file mode 100644 index 4ee2ca902e757a48a0da219ebc6b70b13e1ff0d7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4924 zcmeHK%TE(Q7@q|bIT%kK)C)rhL=)3=mm-!Bg(3ytDBuH0)@^s&U1+yucX(;ymK!9- zN8-sx;=$L!Kfr^;vnD1UJ$N7yxI)w@iO4!LI}w0xR!uYSd2hXYHk$4kCmUR?&?<|F@ZE0ghg-C4O+uQ*j~aRlQYvLBcw7p zypjzWzKZ`v)30T|`2B`rlF z_NLJI9O|O>)@JaH4R&|&+qS<8!2f`Mj%NTdEDoBZU>mu#ijwQP#J2NGUn}vOU7n5+ z{-n0klMlb6@)USWYh7S6VB=<)T=^zV{XcU%35to^3+&P)x{~W54wz;>4bym44|A17 z=A2&%{1PU%wWJ6Fo1<{Vvn>(sC$Tx`)L_2V7Z?#js_g zkww+&;%%|^qaE?iu4BhfoIG{FTU%ptRz`U~#iMX4duc(CbEGzCycjGl|E9~bNGn1bRYZOf1#W;Th^&^RkkmjFLBgWuHx~Td bAP_iWcu+CLXj7O-KH|0IQ_*fY?2*tf*7&hK diff --git a/pandas/io/tests/data/stata5_v10.dta b/pandas/io/tests/data/stata5_v10.dta deleted file mode 100644 index 76de9a40090a7b43e6e4cdeba156bc8cb3e36bbc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4924 zcmeHKO-vI(6n+c-a6r9yP-DUnQi2I-+ERXMh(dvaii+~1;z76F?GN;ic99>6TcU}O z7{%zRYU07agEtdmqQryIM2(4Fj0X-R0#_R~5hJqB%+5kv+Um)emX~ziyf@!_JM(4U z%run8K>-wkDvdwRb6eg-VGQeS7oJ1i&THJ0ktjw0JW>>epr$WIDOkdo5Ayr|*76O| zRlc6)C>yTaM1CAbOqgnrO|&DE!DvRts@-M-{9OFCq%3{o>_d`x3JgY?CspI|OL#m3 zz#H(pGG=gk13b==UW7ou-HT_DRsi@s1;>OV6u zI5f)7!V z-Py~96XxVqTs%J6Y&2DvE32w&EC$FKE{x(F+YfR3B6ic(;r4`)`u_#F`7pu{Zsc(D zA>JX0p>R102ax3ECFBVq43yj=^6`TnU%wA|MZZgOqo5e_c_c{`kbMZXOT5JE(SM%Q z%^1_&-%sb^ukpK9$=zLGyh@2!pV zSE+8eQ8GXPb%SpD9+~)jiM|hlTh+MR_B+(p@;>NM@sXllnpeXk_MNfI;}lN^;~EWL z`(4eit)uL|iYIoxtjKs$GQ4A4?7&>$v!j&d$t5njagDI^f~{WV!l*Y@5JxU?QM|&1 ec5V;~WTT;n_ZuzM24cmDwzuM3w3`hzqW=vqeAQ3@ diff --git a/pandas/io/tests/data/stata6.dta b/pandas/io/tests/data/stata6.dta deleted file mode 100644 index d8a46ea3b9696ffa4939b0bf1fb697f9b8b5833d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3048 zcmXSBVr1Z8U}b=S<RO)OF{GB7kzFtD^TF=zPw=kLG&|GqFLRhFd2GZ+|xv|Q=SX 
zGe}1j!p|EP8i1ipnx~CSOp5V`3#zQDWie?splTn%yst~a85}hP>Jx!{P&t4T{Qv(S zB**tT6kIMm_|XY2SC%R>j2Z|Dw9zocl6Y_@8!U=pLgXh#cSz#2I}1*n9SoOTthN4ixnX45rvGzVg+CY zs*s$KSd^Gtl3G-(qfnf$P?C{YqL5jn0906#k*biJSdp1qnyZkLnwMUZp-_-ol$%*n zlA5BBRH+bLl30?cr!dG3O=)Coc=G>$JuoclAYpN*o*^N{6__qkkn>1kQE^FWS$Rce R6*QB8(-OAIYv|^YQ~=?gqTK)h diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 309a0a7b7e256..545a99522b3d1 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -27,36 +27,42 @@ def setUp(self): # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107) can be downloaded from: # http://stata-press.com/data/glmext.html self.dirpath = tm.get_data_path() - self.dta1 = os.path.join(self.dirpath, 'stata1.dta') - self.dta2 = os.path.join(self.dirpath, 'stata2.dta') + self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') + self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') + self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta') self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta') self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta') - self.dta3 = os.path.join(self.dirpath, 'stata3.dta') + self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta') + self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta') self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta') self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') + self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta') self.csv3 = os.path.join(self.dirpath, 'stata3.csv') - self.dta4 = os.path.join(self.dirpath, 'stata4.dta') + self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') + self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta') + self.dta7 = os.path.join(self.dirpath, 'cancer.dta') self.csv7 = os.path.join(self.dirpath, 'cancer.csv') + self.dta8 = 
os.path.join(self.dirpath, 'tbl19-3.dta') + self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv') + self.dta9 = os.path.join(self.dirpath, 'lbw.dta') self.csv9 = os.path.join(self.dirpath, 'lbw.csv') + self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') - self.dta1_13 = os.path.join(self.dirpath, 'stata1_v13.dta') - self.dta2_13 = os.path.join(self.dirpath, 'stata2_v13.dta') - self.dta3_13 = os.path.join(self.dirpath, 'stata3_v13.dta') - self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') + self.csv14 = os.path.join(self.dirpath, 'stata5.csv') - self.dta14 = os.path.join(self.dirpath, 'stata5.dta') self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta') self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta') + self.csv15 = os.path.join(self.dirpath, 'stata6.csv') self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta') self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta') @@ -69,10 +75,10 @@ def read_csv(self, file): return read_csv(file, parse_dates=True) def test_read_dta1(self): - reader = StataReader(self.dta1) - parsed = reader.data() - reader_13 = StataReader(self.dta1_13) - parsed_13 = reader_13.data() + reader_114 = StataReader(self.dta1_114) + parsed_114 = reader_114.data() + reader_117 = StataReader(self.dta1_117) + parsed_117 = reader_117.data() # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. 
expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], @@ -83,8 +89,8 @@ def test_read_dta1(self): # the casting doesn't fail so need to match stata here expected['float_miss'] = expected['float_miss'].astype(np.float32) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta2(self): if LooseVersion(sys.version) < '2.7': @@ -130,28 +136,27 @@ def test_read_dta2(self): expected['yearly_date'] = expected['yearly_date'].astype('O') with warnings.catch_warnings(record=True) as w: - parsed = self.read_dta(self.dta2) - parsed_13 = self.read_dta(self.dta2_13) - # parsed_113 = self.read_dta(self.dta2_113) - parsed_114 = self.read_dta(self.dta2_114) # Redundant + parsed_114 = self.read_dta(self.dta2_114) parsed_115 = self.read_dta(self.dta2_115) + parsed_117 = self.read_dta(self.dta2_117) + # 113 is buggy due ot limits date format support in Stata + # parsed_113 = self.read_dta(self.dta2_113) + np.testing.assert_equal( len(w), 1) # should get a warning for that format. 
# buggy test because of the NaT comparison on certain platforms - tm.assert_frame_equal(parsed, expected) # Format 113 test fails since it does not support tc and tC formats - #tm.assert_frame_equal(parsed_113, expected) + # tm.assert_frame_equal(parsed_113, expected) tm.assert_frame_equal(parsed_114, expected) tm.assert_frame_equal(parsed_115, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta3(self): - parsed = self.read_dta(self.dta3) parsed_113 = self.read_dta(self.dta3_113) parsed_114 = self.read_dta(self.dta3_114) parsed_115 = self.read_dta(self.dta3_115) - parsed_13 = self.read_dta(self.dta3_13) + parsed_117 = self.read_dta(self.dta3_117) # match stata here expected = self.read_csv(self.csv3) @@ -159,18 +164,17 @@ def test_read_dta3(self): expected['year'] = expected['year'].astype(np.int16) expected['quarter'] = expected['quarter'].astype(np.int8) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed, parsed_113) - tm.assert_frame_equal(parsed, parsed_114) - tm.assert_frame_equal(parsed, parsed_115) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta4(self): - parsed = self.read_dta(self.dta4) parsed_113 = self.read_dta(self.dta4_113) parsed_114 = self.read_dta(self.dta4_114) parsed_115 = self.read_dta(self.dta4_115) - parsed_13 = self.read_dta(self.dta4_13) + parsed_117 = self.read_dta(self.dta4_117) + expected = DataFrame.from_records( [ ["one", "ten", "one", "one", "one"], @@ -187,11 +191,10 @@ def test_read_dta4(self): columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', 'labeled_with_missings', 'float_labelled']) - tm.assert_frame_equal(parsed, expected) tm.assert_frame_equal(parsed_113, expected) tm.assert_frame_equal(parsed_114, expected) 
tm.assert_frame_equal(parsed_115, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_write_dta5(self): # skip_if_not_little_endian() @@ -333,9 +336,14 @@ def test_read_write_dta13(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - def test_read_write_reread_dat14(self): - parsed = self.read_dta(self.dta14) - parsed.index.name = 'index' + def test_read_write_reread_dta14(self): + expected = self.read_csv(self.csv14) + cols = ['byte_','int_', 'long_','float_','double_'] + for col in cols: + expected[col] = expected[col].convert_objects(convert_numeric=True) + expected['float_'] = expected['float_'].astype(np.float32) + expected['date_td'] = pd.to_datetime(expected['date_td'],coerce=True) + parsed_113 = self.read_dta(self.dta14_113) parsed_113.index.name = 'index' parsed_114 = self.read_dta(self.dta14_114) @@ -343,14 +351,13 @@ def test_read_write_reread_dat14(self): parsed_115 = self.read_dta(self.dta14_115) parsed_115.index.name = 'index' - tm.assert_frame_equal(parsed_113, parsed) - tm.assert_frame_equal(parsed_114, parsed) - tm.assert_frame_equal(parsed_115, parsed) + tm.assert_frame_equal(parsed_114, parsed_113) + tm.assert_frame_equal(parsed_114, parsed_115) with tm.ensure_clean() as path: - parsed.to_stata(path, {'date_td': 'tc'}, write_index=False) + parsed_114.to_stata(path, {'date_td': 'td'}, write_index=False) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed) + tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114) def test_read_write_reread_dta15(self): expected = self.read_csv(self.csv15) From 530311cb81b7985b6cd49c5a067f732dbc1cdfd6 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 12 Feb 2014 19:15:00 +0000 Subject: [PATCH 52/64] BUG: Changes types used in packing structs Started work on correcting incorrect types in struct unpacking --- pandas/io/stata.py | 2 +- 
pandas/io/tests/test_stata.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 55bcbd76c2248..cd5c443b3d6a4 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -258,7 +258,7 @@ def __init__(self, encoding): (65530, np.int16) ] ) - self.TYPE_MAP = lrange(251) + list('bhlfd') + self.TYPE_MAP = lrange(251) + list('hlqfd') self.TYPE_MAP_XML = \ dict( [ diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1640bee7a9929..db268d20af048 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -272,6 +272,21 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta13(self): + s1 = Series(2**9,dtype=np.int16) + s2 = Series(2**17,dtype=np.int32) + s3 = Series(2**33,dtype=np.int64) + original = DataFrame({'short':s1,'int':s2,'long':s3}) + original.index.name = 'index' + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + original) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 0f9ff846c76714f6014a323049948c674b954fbc Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 26 Feb 2014 13:52:03 +0000 Subject: [PATCH 53/64] Corrected incorrect data type conversion between pandas and Stata Remove unnecessary, potentially precision degrading cast to Series when writing data Added function to cast columns from NumPy data types to Stata data types Corrected tests for correct Stata datatypes Fixed formatting in comparison after casting --- pandas/io/stata.py | 58 +++++++++++++++++++++++++---------- pandas/io/tests/test_stata.py | 18 ++++++++--- 2 files changed, 56 insertions(+), 20 deletions(-) diff --git a/pandas/io/stata.py 
b/pandas/io/stata.py index cd5c443b3d6a4..720461f23def3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -175,6 +175,26 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +def _cast_to_stata_types(data): + for col in data: + dtype = data[col].dtype + if dtype==np.int8: + if data[col].max() > 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype==np.int16: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype==np.int64: + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() <= 2*53 or data[col].min() >= -2**53: + from warnings import warn + warn("int64 data out of range for float64, data possibly truncated.") + + return data + class StataMissingValue(StringMixin): """ An observation's missing value. @@ -240,9 +260,9 @@ def __init__(self, encoding): dict( lzip(range(1, 245), ['a' + str(i) for i in range(1, 245)]) + [ - (251, np.int16), - (252, np.int32), - (253, np.int64), + (251, np.int8), + (252, np.int16), + (253, np.int32), (254, np.float32), (255, np.float64) ] @@ -253,12 +273,12 @@ def __init__(self, encoding): (32768, np.string_), (65526, np.float64), (65527, np.float32), - (65528, np.int64), - (65529, np.int32), - (65530, np.int16) + (65528, np.int32), + (65529, np.int16), + (65530, np.int8) ] ) - self.TYPE_MAP = lrange(251) + list('hlqfd') + self.TYPE_MAP = lrange(251) + list('bhlfd') self.TYPE_MAP_XML = \ dict( [ @@ -855,11 +875,12 @@ def _dtype_to_stata_type(dtype): See TYPE_MAP and comments for an explanation. This is also explained in the dta spec. 
1 - 244 are strings of this length - 251 - chr(251) - for int8 and int16, byte - 252 - chr(252) - for int32, int - 253 - chr(253) - for int64, long - 254 - chr(254) - for float32, float - 255 - chr(255) - double, double + Pandas Stata + 251 - chr(251) - for int8 byte + 252 - chr(252) - for int16 int + 253 - chr(253) - for int32 long + 254 - chr(254) - for float32 float + 255 - chr(255) - for double double If there are dates to convert, then dtype will already have the correct type inserted. @@ -878,8 +899,10 @@ def _dtype_to_stata_type(dtype): elif dtype == np.int64: return chr(253) elif dtype == np.int32: + return chr(253) + elif dtype == np.int16: return chr(252) - elif dtype == np.int8 or dtype == np.int16: + elif dtype == np.int8: return chr(251) else: # pragma : no cover raise ValueError("Data type %s not currently understood. " @@ -970,7 +993,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._file = _open_file_binary_write( fname, self._encoding or self._default_encoding ) - self.type_converters = {253: np.long, 252: int} + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): """ @@ -990,11 +1013,14 @@ def __init__(self, data): self.data = data def __iter__(self): - for i, row in data.iterrows(): - yield row + for row in data.itertuples(): + # First element is index, so remove + yield row[1:] if self._write_index: data = data.reset_index() + # Check columns for compatbaility with stata + data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape self.data = data diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index db268d20af048..76783147b9c32 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -128,8 +128,8 @@ def test_read_dta3(self): # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int32) - 
expected['quarter'] = expected['quarter'].astype(np.int16) + expected['year'] = expected['year'].astype(np.int16) + expected['quarter'] = expected['quarter'].astype(np.int8) tm.assert_frame_equal(parsed, expected) tm.assert_frame_equal(parsed_13, expected) @@ -175,6 +175,9 @@ def test_write_dta6(self): original = self.read_csv(self.csv3) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['year'] = original['year'].astype(np.int32) + original['quarter'] = original['quarter'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None, False) @@ -209,6 +212,8 @@ def test_read_write_dta10(self): 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['integer'] = original['integer'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, {'datetime': 'tc'}, False) @@ -245,6 +250,7 @@ def test_read_write_dta11(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -263,6 +269,7 @@ def test_read_write_dta12(self): formatted = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -277,14 +284,17 @@ def test_read_write_dta13(self): s1 = Series(2**9,dtype=np.int16) s2 = Series(2**17,dtype=np.int32) s3 = Series(2**33,dtype=np.int64) - original = DataFrame({'short':s1,'int':s2,'long':s3}) + original = DataFrame({'int16':s1,'int32':s2,'int64':s3}) original.index.name = 'index' + formatted = original + formatted['int64'] = formatted['int64'].astype(np.float64) + 
with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), - original) + formatted) From 0f36d6ba2d7b8a428bc1f9aca953dbacf0d2f858 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 26 Feb 2014 19:01:34 +0000 Subject: [PATCH 54/64] Added docstring for new function and warning class --- pandas/io/stata.py | 48 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 720461f23def3..2882a7c42ba1c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -175,26 +175,62 @@ def _datetime_to_stata_elapsed(date, fmt): raise ValueError("fmt %s not understood" % fmt) +class PossiblePrecisionLoss(Warning): + pass + + +precision_loss_doc = """ +Column converted from %s to %s, and some data are outside of the lossless +conversion range. This may result in a loss of precision in the saved data. +""" + + def _cast_to_stata_types(data): + """Checks the dtypes of the columns of a pandas DataFrame for + compatibility with the data types and ranges supported by Stata, and + converts if necessary. + + Parameters + ---------- + data : DataFrame + The DataFrame to check and convert + + Notes + ----- + Numeric columns must be one of int8, int16, int32, float32 or float64, with + some additional value restrictions on the integer data types. int8 and + int16 columns are checked for violations of the value restrictions and + upcast if needed. int64 data is not usable in Stata, and so it is + downcast to int32 whenever the value are in the int32 range, and + sidecast to float64 when larger than this range. If the int64 values + are outside of the range of those perfectly representable as float64 values, + a warning is raised. 
+ """ + ws = '' for col in data: dtype = data[col].dtype - if dtype==np.int8: + if dtype == np.int8: if data[col].max() > 100 or data[col].min() < -127: data[col] = data[col].astype(np.int16) - elif dtype==np.int16: + elif dtype == np.int16: if data[col].max() > 32740 or data[col].min() < -32767: data[col] = data[col].astype(np.int32) - elif dtype==np.int64: + elif dtype == np.int64: if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) - if data[col].max() <= 2*53 or data[col].min() >= -2**53: - from warnings import warn - warn("int64 data out of range for float64, data possibly truncated.") + if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53: + ws = precision_loss_doc % ('int64', 'float64') + + if ws: + import warnings + + warnings.warn(ws, PossiblePrecisionLoss) return data + class StataMissingValue(StringMixin): """ An observation's missing value. From 7cb87f481c135b54e94ce2f00906819e197493fa Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 14:36:00 +0000 Subject: [PATCH 55/64] BUG: Fixes and tests for extreme values in all data types The extreme values of float and double (Stata, pandas eqiv: float 32 and float64) were not correct. This resulted in incorrect truncation. The handling of missing values have been improved and code to convert missing values in any format has been added. The improvement differentiated between valid ranges for data and missing values. Additional issues were found when handling missing Dates, where missing Dates (NaT) were converted to non-missing dates when written. A test has been added for extreme numeric values as well as missing values. 
--- pandas/io/stata.py | 54 +++++++++++++++++++++++++------- pandas/io/tests/data/stata5.csv | 13 ++++++++ pandas/io/tests/data/stata5.dta | Bin 0 -> 4924 bytes pandas/io/tests/test_stata.py | 10 ++++++ 4 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 pandas/io/tests/data/stata5.csv create mode 100644 pandas/io/tests/data/stata5.dta diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2882a7c42ba1c..1c2ae8ea27baa 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -23,7 +23,7 @@ from pandas.compat import long, lrange, lmap, lzip from pandas import isnull from pandas.io.common import get_filepath_or_buffer - +from pandas.tslib import NaT def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None): @@ -150,6 +150,11 @@ def _datetime_to_stata_elapsed(date, fmt): if not isinstance(date, datetime.datetime): raise ValueError("date should be datetime.datetime format") stata_epoch = datetime.datetime(1960, 1, 1) + # Handle NaTs + if date is NaT: + # Missing value for dates ('.'), assumed always double + # TODO: Should be moved so a const somewhere, and consolidated + return struct.unpack(' nmax: if self._missing_values: return StataMissingValue(nmax, d) @@ -1243,7 +1273,7 @@ def _write_data_dates(self): self._write(var) else: if isnull(var): # this only matters for floats - var = MISSING_VALUES[typ] + var = MISSING_VALUES[TYPE_MAP[typ]] self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) def _null_terminate(self, s, as_string=False): diff --git a/pandas/io/tests/data/stata5.csv b/pandas/io/tests/data/stata5.csv new file mode 100644 index 0000000000000..cc597582472e3 --- /dev/null +++ b/pandas/io/tests/data/stata5.csv @@ -0,0 +1,13 @@ +byte_,int_,long_,float_,double_,date_td,string_,string_1 +0,0,0,0,0,,"a","a" +1,1,1,1,1,,"ab","b" +-1,-1,-1,-1,-1,,"abc","c" +100,32740,-2.15e+09,-1.70e+38,-2.0e+307,01jan1970,"abcdefghijklmnop","d" 
+-127,-32767,2.15e+09,1.70e+38,8.0e+307,02jan1970,"abcdefghijklmnopqrstuvwxyz","e" +,0,,,,01jan2014,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" +0,,,,,01jan2114,"1234567890","1" +,,0,,,31dec2014,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" +.a,.a,.a,.a,.a,29feb2012,"!","A" +.z,.z,.z,.z,.z,,"&","Z" +,,,0,,,"1.23","!" +,,,,0,,"10jan1970","." diff --git a/pandas/io/tests/data/stata5.dta b/pandas/io/tests/data/stata5.dta new file mode 100644 index 0000000000000000000000000000000000000000..4ee2ca902e757a48a0da219ebc6b70b13e1ff0d7 GIT binary patch literal 4924 zcmeHK%TE(Q7@q|bIT%kK)C)rhL=)3=mm-!Bg(3ytDBuH0)@^s&U1+yucX(;ymK!9- zN8-sx;=$L!Kfr^;vnD1UJ$N7yxI)w@iO4!LI}w0xR!uYSd2hXYHk$4kCmUR?&?<|F@ZE0ghg-C4O+uQ*j~aRlQYvLBcw7p zypjzWzKZ`v)30T|`2B`rlF z_NLJI9O|O>)@JaH4R&|&+qS<8!2f`Mj%NTdEDoBZU>mu#ijwQP#J2NGUn}vOU7n5+ z{-n0klMlb6@)USWYh7S6VB=<)T=^zV{XcU%35to^3+&P)x{~W54wz;>4bym44|A17 z=A2&%{1PU%wWJ6Fo1<{Vvn>(sC$Tx`)L_2V7Z?#js_g zkww+&;%%|^qaE?iu4BhfoIG{FTU%ptRz`U~#iMX4duc(CbEGzCycjGl|E9~bNGn1bRYZOf1#W;Th^&^RkkmjFLBgWuHx~Td bAP_iWcu+CLXj7O-KH|0IQ_*fY?2*tf*7&hK literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 76783147b9c32..7b2f31a904823 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -43,6 +43,8 @@ def setUp(self): self.dta2_13 = os.path.join(self.dirpath, 'stata2_v13.dta') self.dta3_13 = os.path.join(self.dirpath, 'stata3_v13.dta') self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') + self.csv14 = os.path.join(self.dirpath, 'stata5.csv') + self.dta14 = os.path.join(self.dirpath, 'stata5.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) @@ -296,6 +298,14 @@ def test_read_write_dta13(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + def 
test_read_write_reread_dat14(self): + parsed = self.read_dta(self.dta14) + parsed.index.name = 'index' + with tm.ensure_clean() as path: + parsed.to_stata(path, {'date_td': 'tc'}, write_index=False) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed) + if __name__ == '__main__': From 8ba7a357c4e21e6f3c05ff2fa6393424a3c581d4 Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 15:39:11 +0000 Subject: [PATCH 56/64] BUG: Fixes and tests for extreme values in all data types The extreme values of float and double (Stata, pandas eqiv: float 32 and float64) were not correct. This resulted in incorrect truncation. The handling of missing values have been improved and code to convert missing values in any format has been added. The improvement differentiated between valid ranges for data and missing values. Additional issues were found when handling missing Dates, where missing Dates (NaT) were converted to non-missing dates when written. A test has been added for extreme numeric values as well as missing values. --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1c2ae8ea27baa..e885fa6fcd990 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -343,7 +343,7 @@ def __init__(self, encoding): # that can be represented. 
it's the 27 ABOVE and BELOW the max listed # numeric data type in [U] 12.2.2 of the 11.2 manual float32_min = '\xff\xff\xff\xfe' - float32_max = '\xff\xff\xff\xfe' + float32_max = '\xff\xff\xff\x7e' float64_min = '\xff\xff\xff\xff\xff\xff\xef\xff' float64_max = '\xff\xff\xff\xff\xff\xff\xdf\x7f' self.VALID_RANGE = \ From d171d96fd2d3c19c329ed16f40e5d0db0032d5b9 Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 16:15:37 +0000 Subject: [PATCH 57/64] BUG: Fixes and tests for extreme values in all data types The extreme values of float and double (Stata, pandas eqiv: float 32 and float64) were not correct. This resulted in incorrect truncation. The handling of missing values have been improved and code to convert missing values in any format has been added. The improvement differentiated between valid ranges for data and missing values. Additional issues were found when handling missing Dates, where missing Dates (NaT) were converted to non-missing dates when written. A test has been added for extreme numeric values as well as missing values. 
--- pandas/io/stata.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index e885fa6fcd990..74e0326dfecf8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -154,7 +154,7 @@ def _datetime_to_stata_elapsed(date, fmt): if date is NaT: # Missing value for dates ('.'), assumed always double # TODO: Should be moved so a const somewhere, and consolidated - return struct.unpack(' Date: Fri, 28 Feb 2014 16:23:35 +0000 Subject: [PATCH 58/64] Disabled the big endian skips --- pandas/io/tests/test_stata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 7b2f31a904823..8914a596c1566 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -159,7 +159,7 @@ def test_read_dta4(self): tm.assert_frame_equal(parsed_13, expected) def test_read_write_dta5(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', @@ -173,7 +173,7 @@ def test_read_write_dta5(self): original) def test_write_dta6(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = self.read_csv(self.csv3) original.index.name = 'index' @@ -206,7 +206,7 @@ def test_read_dta9(self): tm.assert_frame_equal(parsed, expected) def test_read_write_dta10(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame(data=[["string", "object", 1, 1.1, np.datetime64('2003-12-25')]], @@ -245,7 +245,7 @@ def test_encoding(self): self.assert_(isinstance(result, unicode)) def test_read_write_dta11(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(1, 2, 3, 4)], columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______']) @@ -264,7 +264,7 @@ def test_read_write_dta11(self): 
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) def test_read_write_dta12(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-']) From 040b73627d23e1749a624e274c90f10d76d9a1d3 Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 17:01:44 +0000 Subject: [PATCH 59/64] Fixed legacy date issue with format 114 files Added test for 114 files --- pandas/io/stata.py | 4 ++-- pandas/io/tests/test_stata.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 74e0326dfecf8..e360fa2a86771 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True, return reader.data(convert_dates, convert_categoricals, index) -_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"] +_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] def _stata_elapsed_date_to_datetime(date, fmt): @@ -109,7 +109,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): from warnings import warn warn("Encountered %tC format. 
Leaving in Stata Internal Format.") return date - elif fmt in ["%td", "td"]: + elif fmt in ["%td", "td", "%d", "d"]: return stata_epoch + datetime.timedelta(int(date)) elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week year = datetime.datetime(stata_epoch.year + date // 52, 1, 1) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 8914a596c1566..e82d83bd41839 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -301,6 +301,9 @@ def test_read_write_dta13(self): def test_read_write_reread_dat14(self): parsed = self.read_dta(self.dta14) parsed.index.name = 'index' + parsed_10 = self.read_dta(self.dta14_10) + parsed_10.index.name = 'index' + tm.assert_frame_equal(parsed_10,parsed) with tm.ensure_clean() as path: parsed.to_stata(path, {'date_td': 'tc'}, write_index=False) written_and_read_again = self.read_dta(path) From 4f719ad6fd167a065d23f977ec2cc9e25812d074 Mon Sep 17 00:00:00 2001 From: bashtage Date: Fri, 28 Feb 2014 17:02:38 +0000 Subject: [PATCH 60/64] Added format 114 (Stata 9/10/11) data file --- pandas/io/tests/data/stata5_v10.dta | Bin 0 -> 4924 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/io/tests/data/stata5_v10.dta diff --git a/pandas/io/tests/data/stata5_v10.dta b/pandas/io/tests/data/stata5_v10.dta new file mode 100644 index 0000000000000000000000000000000000000000..76de9a40090a7b43e6e4cdeba156bc8cb3e36bbc GIT binary patch literal 4924 zcmeHKO-vI(6n+c-a6r9yP-DUnQi2I-+ERXMh(dvaii+~1;z76F?GN;ic99>6TcU}O z7{%zRYU07agEtdmqQryIM2(4Fj0X-R0#_R~5hJqB%+5kv+Um)emX~ziyf@!_JM(4U z%run8K>-wkDvdwRb6eg-VGQeS7oJ1i&THJ0ktjw0JW>>epr$WIDOkdo5Ayr|*76O| zRlc6)C>yTaM1CAbOqgnrO|&DE!DvRts@-M-{9OFCq%3{o>_d`x3JgY?CspI|OL#m3 zz#H(pGG=gk13b==UW7ou-HT_DRsi@s1;>OV6u zI5f)7!V z-Py~96XxVqTs%J6Y&2DvE32w&EC$FKE{x(F+YfR3B6ic(;r4`)`u_#F`7pu{Zsc(D zA>JX0p>R102ax3ECFBVq43yj=^6`TnU%wA|MZZgOqo5e_c_c{`kbMZXOT5JE(SM%Q z%^1_&-%sb^ukpK9$=zLGyh@2!pV 
zSE+8eQ8GXPb%SpD9+~)jiM|hlTh+MR_B+(p@;>NM@sXllnpeXk_MNfI;}lN^;~EWL z`(4eit)uL|iYIoxtjKs$GQ4A4?7&>$v!j&d$t5njagDI^f~{WV!l*Y@5JxU?QM|&1 ec5V;~WTT;n_ZuzM24cmDwzuM3w3`hzqW=vqeAQ3@ literal 0 HcmV?d00001 From 266983d7a555be3b2ad610f627c5b31b813b4313 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 28 Feb 2014 19:06:43 +0000 Subject: [PATCH 61/64] Add test for Stata data with file format 114 --- pandas/io/tests/test_stata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index e82d83bd41839..1aea3a67394dc 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -45,6 +45,7 @@ def setUp(self): self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') self.csv14 = os.path.join(self.dirpath, 'stata5.csv') self.dta14 = os.path.join(self.dirpath, 'stata5.dta') + self.dta14_10 = os.path.join(self.dirpath, 'stata5_v10.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) From 4ce821b1a886d0f1a9ab664b13dfd1068034b6bc Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Sat, 1 Mar 2014 17:05:46 +0000 Subject: [PATCH 62/64] Added additional data files for testing alternative Stata file formats --- pandas/io/stata.py | 11 +++-- pandas/io/tests/data/stata2_113.dta | Bin 0 -> 1490 bytes pandas/io/tests/data/stata2_114.dta | Bin 0 -> 1786 bytes pandas/io/tests/data/stata2_115.dta | Bin 0 -> 1786 bytes pandas/io/tests/data/stata3_113.dta | Bin 0 -> 12737 bytes pandas/io/tests/data/stata3_114.dta | Bin 0 -> 13255 bytes pandas/io/tests/data/stata3_115.dta | Bin 0 -> 13255 bytes pandas/io/tests/data/stata4_113.dta | Bin 0 -> 1528 bytes pandas/io/tests/data/stata4_114.dta | Bin 0 -> 1713 bytes pandas/io/tests/data/stata4_115.dta | Bin 0 -> 1713 bytes pandas/io/tests/data/stata5_113.dta | Bin 0 -> 4628 bytes pandas/io/tests/data/stata5_114.dta | Bin 0 -> 4924 bytes pandas/io/tests/data/stata5_115.dta | Bin 0 -> 4924 bytes pandas/io/tests/data/stata6.csv | 6 +++ 
pandas/io/tests/data/stata6.dta | Bin 0 -> 3048 bytes pandas/io/tests/data/stata6_113.dta | Bin 0 -> 2752 bytes pandas/io/tests/data/stata6_114.dta | Bin 0 -> 3048 bytes pandas/io/tests/data/stata6_115.dta | Bin 0 -> 3048 bytes pandas/io/tests/test_stata.py | 73 +++++++++++++++++++++++++--- 19 files changed, 78 insertions(+), 12 deletions(-) create mode 100644 pandas/io/tests/data/stata2_113.dta create mode 100644 pandas/io/tests/data/stata2_114.dta create mode 100644 pandas/io/tests/data/stata2_115.dta create mode 100644 pandas/io/tests/data/stata3_113.dta create mode 100644 pandas/io/tests/data/stata3_114.dta create mode 100644 pandas/io/tests/data/stata3_115.dta create mode 100644 pandas/io/tests/data/stata4_113.dta create mode 100644 pandas/io/tests/data/stata4_114.dta create mode 100644 pandas/io/tests/data/stata4_115.dta create mode 100644 pandas/io/tests/data/stata5_113.dta create mode 100644 pandas/io/tests/data/stata5_114.dta create mode 100644 pandas/io/tests/data/stata5_115.dta create mode 100644 pandas/io/tests/data/stata6.csv create mode 100644 pandas/io/tests/data/stata6.dta create mode 100644 pandas/io/tests/data/stata6_113.dta create mode 100644 pandas/io/tests/data/stata6_114.dta create mode 100644 pandas/io/tests/data/stata6_115.dta diff --git a/pandas/io/stata.py b/pandas/io/stata.py index e360fa2a86771..2ecdb22a5cc7b 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): # numpy types and numpy datetime isn't mature enough / we can't rely on # pandas version > 0.7.1 #TODO: IIRC relative delta doesn't play well with np.datetime? + #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. 
datetime[Y] for yearly if np.isnan(date): return np.datetime64('nat') @@ -154,7 +155,7 @@ def _datetime_to_stata_elapsed(date, fmt): if date is NaT: # Missing value for dates ('.'), assumed always double # TODO: Should be moved so a const somewhere, and consolidated - return struct.unpack(' """ - + # TODO: Needs test def __init__(self, offset, value): self._value = value value_type = type(value) @@ -370,8 +371,8 @@ def __init__(self, encoding): 'b': 101, 'h': 32741, 'l': 2147483621, - 'f': np.float32(struct.unpack('9q{gU z0MPMPq+h|%*viO+;s5_XfByXb{U;@{B()?nH#I&PtQwPmD@e*r$EpaV z08<;1NO@{%c1~qHZgsi&c_kV6R1}sb7L}wH;Z~B7n3EP?nVN`G2|hU_ry?^|4Ndh7 z(jftg!cJj8h?kTzz}1xGBC!jR*cnLdN;n(Yt`W`dh~o$QzhU94H|?cPS?-!#5lmJo zx7b0VGeolwg6L(F%?^rvU;g*V|99V*KmtJUpdLm)tp~x>)TdyPQ3FTAkc?ylNlY3y M-vSdM162|u07r08-T(jq literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata2_114.dta b/pandas/io/tests/data/stata2_114.dta new file mode 100644 index 0000000000000000000000000000000000000000..df9c19ee0a9762757d775f5793760b67a2085ebb GIT binary patch literal 1786 zcmXS7Vr1Z8U}k^-DIf*HCv6#jgG92EIvBqEY-W4_#A)%_3>ExUtaU&PRsxG`2FbWcdI8&!0cPfB#8IEJ-cN z%uS6?2C2gW;0ls5)3GW7DZru+Ayl54nw?V_k6T@CeqKoiJ{5(fiA5!;MYxq@B<7^W zSEeT7RDw?q;ZhWqYDqGKK{`Z8)zDNA&Q4)KuuIAjYE(;{VI0+xLKu@lwIljQuN@9CM%+UJ-LA3mu6 zq+-;+Tjd>^XfM?7`7eBD5AVVKIi}&=`~Uh2pFVkk_Tj(#PuSwg3l>fPhtDjSK6%cJ zX`z3IXBhPZQ|HfH__v=>KQMFNlD~al`-U0wmn{0*PpIE8Ep+DJzRyhkr#DsDT+hD; zYws+YGIP#?$&04{?Z+0+n?Cm+vcNE*^Z)T}hM74}n)A0`ufFkTqW(S&{|IK7?t}Vx z%=qgw|I2Id|N8g;2VVal!ua>?|GzTu=Co0DcupmGwjf?OZS_b{ZagA3Rrk$Yb`#l*fm3_Z0qTT>Dsk`9XIo9lb_g%>71Km-bYU>Yosfdk9EV=HD;;H;4byMr)RT zJLP{D2Q@AGSY5(&7pnYUlN=$pBjq`647fMl`VvA5rl0&@0s3Da0%+mcZ!My#7J89q0)eAviZ<4pU&#~zVoC^;Ll{%jlDv+<9&e%eL-qR)tcp` zCXmb!UXs%I3%dS-O;t<2c2uG%oz?sF@T5wITV!#|?E=Tu2;AwHM3QRF9A_(ZigMK; zB?tP$v1Yy;*A-R_6GJ_z?0u6k9@~U9ho0o)P>daD&t>9zRMsv({otL1Lx$zMW znFZ;rvD&3Cm7;Hw3%6Gb8wb`2T*XT=I=9X&J2piuEf9VwA3kh=;D1{-As*9yR_EQ{ zgGx(ok}57r2z>llVC-L$+sjpmo#2q!29WFgi$4ayN?AdspC`Yc)jO=>s9bu7eAOvl 
zXmELox84z#peJz5PP2lIm%o=aP^j;fZ*mFx7*-59(g4ccy-Vzzz7ZN8)(gte4;VO4 zV8tGFanP)Ej~;}t3NQu5p8~|_?=+Zl3m*`2;#(oh@=jnn>uK$%r^r}yL6UkPW9Rr6 zKHeHcl4?vDtgd>Y4^?V^BUxScpbH zpOCH*>B8>s8xdxc74@-sj!F0@L?ukt=pm>P3y!(MF{8fJOu4Sl@$I{X;I~#p-rtCJ zih6EV0vwR!ftr;uhM{nJxCxf{1FUY^C_<%kFUas42ZXHSdcvG%sZEt?rIjEjc_U+a ze8je)fU~CBQEdUy_bd^?u~#JQ^GxAkw2sKl8^g*=SZRY;%*fbLWXuN?_26$W-{yOO zH3UWtr*imPV%IcB=$U6tq@6aDVd}x;wnFWw@tR|6KY2Ylm8dXjXE4OZ?BZc%uZdyo0Wk)y z980+hD;n|Ql#rOl5J_x9!+hQ%mdnjNV0GP$ku#wm01r3Rr6*J9mL&A7I4={>P#&Yx#y4K_id@dR~ggx6j9Ox93-C6qGP9O%FakA! zfa=Hbhgn_KqeWC1)0*C6YJ@b`&P0mt4J$@i>4Q#Or&-DGFGBj-A;~qS!>s<^>qS(a z+?I~)cSDG(b|OlxE38;qWXFCWOqd1{kBWfLfWY?gJbSCm< zcZA=_A}4u)@EbMA&X4`TvFW^z%{W*wMqOG)x%Ta7@vQs8TRS**74t^2M#faROV*Nn z{h}YN?7*-X$EV%-4or%)l3mj`|VHgY4GEIZ{3?GJ6Y(~a0DF7C>Hir3EXW3h8}1VS)`n_;wa9 zTLi~WAeISy{HbA(j!lrh1E%j-ostqsrHkF^eOHEz9odmcM==Sw=79Kv@TWA$K$qdj zUL{1yq3}CapJ|Aq%8eehF5tc3^Fw2E^EAa1@nwF0{CJy&&hcAyTFf zGS&u!8HthfLW7LI{t-A<#%KENG9AGVV}z9^;MI~kp)%J2qT~y;YzN0iBIIj;MDyuV zfN=Wdd2PNNVfBF)Kq`9E-Cj?Gh4%J@d2NQ3_DFIdNK%PdlK8-;BVZ*GR-E}Gtf9q% zXey6#qt*e>g^f>|5-EWI;)Gbn!2dI_lExn|9RVHK!{>I4f@4NUE5vf$mA>Zg2`Rms z68U!lkWQ%nIDG6ISjpo9mX3rz?@~Edkj?6xv({rI^`%u2Iude-CCU{)(6Tce3xY(< z1jq9ERQr!X9tNzgFlDp)-AiJqRNz6|ue2hyc~*c72c!!e3qkma8f4)9k3pWDsEY+w z46*SWsMMn$P32iK-o=V=Cq9B>U6JIekcb3utcV{yJ`mv##p_$JVsv`80aMJ2hFKX% zy|oVH9|4YaLsv?u{!$H6{97Ozd9d1UrW{t+bKXYEtRG0f7-ma`$LWaF=VR2R2l{0; zXgO62zkJFlP5NxAx&orVbbb?+8xNtGHywz$6Z6G+6bRE3(l-yhT8bne$IPNH0_NP{PKA?42=3(~o%DCGb+wi@hZF#nTyIeZMr<7dw8 zxDi&2(s@{^r_^&(FCwF%+5dG@%Bq~27bX_ASlHU5IAUB^3E7T&T zMWZVlH1enX^eHSlnf-6SEj-E^TmllQoJXnN)q&_gsz)r7QT+gm)MGsqMvwWQ%`aX! 
z7IBTo`fw0djJZd4Q00n+CTwj**0n}mib7yzl!be~9<}%dj#cnu>&Bu(oy|TrenrPv zUC>Y4(f5RQ{ia^%JL07v{W1ww#(rLjo2>KYH7y zPIx%>xuA@h1;}`de0?K6b_B85ncKV>hwz_3`d$&n{ZbIiXj+-lmN=4E0(Tdy12YY!+=h>(BmC!lz4b(p|AKn3f)&G7`&}3oW9ayY zZHeucdVy;ahFE5xUv{ASyEH35r%u%3s_&Hyi1Gc9-LNu_hQDznoxEP5lncP*StzB9 zjBNqO-tZOfpFsx>!^-T!JXYt}F%^SvB0U)AOnRE%2+ZpRa4Zy7cB1#UB4bba_J==1 zzu1DwQAK&I-gfdnD$o3kHfr35)Q|r|Q1TZeV+&Er6b!pvh~+Nd*GHAX{g`Hp3-VY) z@T3eXul}6Aj~qzCl=p%%Cmb1Df+X*OW5H5 z_yOUU@<;wqAgHP4u5IPA?wli1lVE{3f6n~F?nMoaM z1Bh)u119oXSc$<7lMRm9qpkA!u%9O($wo7j-*kf2B{n-sIm2u$@Bt)dvVm|PB4K4C z>XM5VYpp??Lqjl<%;^2zMJI3;8*-E~w`NhFSpg(okNILB2`gK`v3v|Z2OQ&$@(y;B zQKL@*Sp$e6bHgzz^_fe*n(t4pUo;ZwM{FBXf<=-FK*gS#6+2}zWY8C}WEGxZjrC1( zapIdxhvyJte;$xKQLvH>9V^%7=V) z-c+=k6%?kbAfGinjm)9Us#$bh$8IERiw%)aM#1WSXmT00qwWa5o-dv?4gJzt)o0&) z)|gp+j4~;6sl&zoWVCe?qO4jENG3|Dz_B@qrIx?>WE!eJSk*E9Bw{&q0#j@rtvD-^ zjrW@p#RGc)cLWMkiGG=+L0o4{NBE7A@yv_d2_i z;!`b&axfN-6r+%6$SMdeNI?j%z~{vJn|OV`2CQsJIT0j^=xp zzd%_#KnHT*m{GqBjukASQ777yw2JnGJG~LHoCcHYK>qjWeOL4R(KF%LsKVM&w&p@s z=Tel1lSMeS@pC3Ok2w+f_SfkBvw+lVS;7Wxep)gUj?LoLjl7W6zopsmg;S zJ7DLpjR&LzwEP21ZUT}xn-fRQfUa z`F$JS7(FO+tvFqK=j28 zWt5u~O)I?Skt1F`2=`8gmCF{{n@2HB zzqH2%-mg&pTl|+ej|)9)&q#-n+nISBmB-J>QT7{v2PoS zSmKH}uCMtNVi|XX%28XW*NQnLq;d#hI;6p|Y74iuCm_X|W9ez3py&{k(jQg~^3q?Z z+-f^@ahOj0f430nZ5piHu*iAtfb7?-bQi_aTSppgd)7bgX-=a#(EZgANGC=eWZg;74Y6q^>=8(-N1;Y5JftGhcOK;TWdu_Z{C=1Xp%g`_P z05L4>b`N(-Ni@nZmn?oRK>7D0{2Gg5>jTHmY2(#>!9vW+Xei8kK#cD852=hTHh0BJ za`$JEDCq|Pd7wdRN9Dr*e7@_ug(&6c*!f%VXK+R)zfo>#G7WbOBTJW@a?hant)PeOSMQn-9XINw1 z;om8@CxsSNMG%)W$k?c{St$9bRK_})m6NBPNl)SC|9_K z#1xwdHyeYHtH(lO!Bw;k6ef+|n!5z#FX3^01Bl-F{7cF`-$fkqDsmhnl7y*vtCeKcjO?Jw-|f`8GDUV?&o_pU#1=3 zuv-FR+)7_ld2TA*S@I<@*2BtRY#Yo6Y~!d=*$dR=AfJ=I41~FYeQdSq9IK1{`7M=> z@1+rBB}sVXN0da!dJTi83d4xHimFz|jtoa#Vi8Lae~#6c-grkPF^$@cTS@Nq@+aIS zY#Xcef)qU*vD9mj3(m`-W8Z4zKZj=))$b_tAdSu%6HY2a;LKBX+prS2=K&-r{x_}q ziFOeP-xf!n) z`qJTGavn-)EifVDNT%O+h~+x(wc;xbi;p!Xm#~JBZivNtAFU3bMqF1S7Te>1G{&Li 
zGxF!n)eSyq{t8Vl)H~P`oRNDqV6>*uM&5IYXQG900v4128;sVe;8-LayUUjjSP2~q z!0Uyk^Q^9}_B~~;?5EG3%^(v~0clf!j5R^VW~2JcP|8|7vsj6pc{mn$FF^F}<%}p- zX3*flOUc<#3t{34Q5Rc;9|{={!8bqW=Py`=-uDN`a(&OUh6UaAqLO=%rk)Qc_tFVb zB8!mZrm(Ucz3;0*yy{lL(Fq{T?Sk{H(LT~3DpxY8<=QIZc!oeOPJ@yM6vGAa&{9_Sd$B>3V-C|XH`kJkJapx>B0!qK${Nhd!J5Fn&sh!W>jzP?@-1cc zCzFh#GUz)x>p>)OvKdOGDW?&Oy})c*hq{=xjHPy7gHpcMp7NBkhWPI*BlR3Vi7*CmGb9pYtYDM48E=>OIc&fW~?Zm%%SN&#*%f7h7cv|9O}{n z=a`Ln@Grnh1;6{{8pQI_gw8bm!0HZ7vKE!1qtvE)GnqfSKT&=?i|V(;g~ZqB{a_7p zHgGM{(E)^61BiZrkB!K5$fZG_$s{>|!*W%MQaT9C<#_Z8}uD7LYntSH;0zoyR^DzU zDo@W*ckw&2^ROF{T3<&gI|+=(I`qqbGzgv{VgENnU>i&d`o+49DF1$*9-sL=i3sXV zI2JoiO(#4ZjRYxlidk(@CK}O79=qW za*+aOQuzEue?rs^R$&cU{o0F+heCJHJwtLXyWyK}qAt#GEDEJmA3!AYc8}G|qCT4R zDLB7O?I0@aFVXx>CFK5AEQ$elkg+a;5)h4rM0G5IcMgmJWG5cjE#oh;y6m5wL^-LP z&TUgpoci@7a@c)jtc$>{jn=HF_ZJV-VxVJheNVsK1c<)FuuiCcC4HE1k(6A(23v&H zf$IuQj>g>o4;+i*bL}=@yKsUiIRj$oG1XaAtglgV%@yJm+?z0iusYNLQlut19^r4| zr>xolD?gdkN1hj1Uo* z{3TX*EVa8Rt@)XjnC_6JHK@zhr?Ao;mn$)l@i>$+hCl8d3y3#1r2^9>R^RVw50U$D zi*8`A-$Q4$YtN9!5#0D`w@Kzcla5Y z+)I#4W1wS+NOCma;1~xckHy-P11mc=Ad9As=2pHLBpi5!ZDDNjt7 z@$;yMeMM1yGNK3xPsxU>XzWZ+VLed_?Vh^ksMtFFN@*mG> z@a}tL<9`MaW{VMy;Rj?zn=vcaM$sOHjhs* zY{HFV34W{4%=bsukn8CsGAZxq`^$GoivyVZR~m^N=Y<_60kK@vtawk`gp+u0m2a?O z3>e}iDv=FTzwZvY`-dNqdSG@Y4G`oBNwAWnS-JcDCT!zWeK1s5!Rq{8d5O}Gf6%q5 zcSx))7R9|RtPF&}CWA1Wv4iFF^YxqI*gg|7SWv<0D`pK8xi*YAw*D?wxKV^T(ij;7 zq#*@^Z!sJ@$2(2jjEu#qzajv{aP6zXqI63qT87pVv1SyJzik4J4Z?ic4TT9csf%KG z-e%OL2gYk995Yhap(2y07wz7>AYSu6A@WRHBzdsF346dXNsA?b-J)qSe&SGch1IQC z;Uh{{4PpdePa?XFB{GZIDGh;+rD;~wy7-UU0xPXy<@>@btX{106_q%n=pW8dm!NTk znPVr)j`(fDrhS+%G91h1Gxu%56q|wDo_4Te=;F(Z++i#6NLM{gZTlIKCt(nl;qhkY z3|P60j2+|`TB|qM)4{hluwrz6XvU_*it~Ro((clDB7fc-5EFhElYyPz7EC_KUk~02 z!dR&*oKGdITXanjnW5HVX(~(YMohrdH`R(6j|lgp_ucT#`}w*1wql2g#1KDTSjpn4g)w2j#Ja3eY} zDHsb$b5!3Ca&Z9CHyOR3!pBe929C8b+f-#2RkFsietsfz$3|?kwh2wUJCP`ZTOi4P z0=M-b$iE7)B=ecaw!uo7FY3f!#oWI%3_4~j#>6(IIh`jH?v(=^^GA{oV(@JOdlMk4 z+qLTB0fgx)tG^yI96L-?@x>=i=@G9iCy 
zi;9h%=svLtb!H|J=51?mY#7#_14weR2DyJ>J7W2vP(7MnWsS#L1c=g-X5!*WEUiw0 zmFI26XInlJxaKQD3STLQ)^V^BGT}B~8eS~?DJIOn+D3)Ql8dr3U)h(%pmDVjp_da@D z-p$4@RUE}3A3lX6hcJ;#z~p4Uv}Om6qWwY3U4_?Jz5TbNuqU?=w+8-!(&5bF<|Mi# zPedbUX=#R{*pdSk*c zk=fQvJkshYeyO&SFiV_7`(RmZ7nwi*xZywMzZW=7;-CMb(FEf~UoWeJfBrwi*yi`u9XZuk$Gl#x3D+`=J(H>V zts!wSUNSxtEy5k~_v?Y0Jf@RJr{R<4ck1iz(XW%c*Pwo0%)7Vm|MDkd+NAma^`-yg zf5R3{n!j+`-+X5Nv`Mq4PYwMa!RkMnKQLw9+yx)~g!uzA=7xXteMY627ryYLpD=%5 zYUqrQzRyhgyEo0SIUXMetM4qFJY)9!Neica^ka+WPMh;LQDB(Rd4Kyh!_1f~&Hm`u zo8R~|P#+J&-+~#Y+rZxL)BpXM|Le8aU;h2CumAF)Zh!svkG}rzPyP2R@P733z5nyp ze_s4=KK$jo{+Eyb`S$J{6?;Cxqi_0BI`Zbf7%P?cpP@dI$-pTk$EO-iCdw?($+cD!=}T)a}?H z`0aWrDDh8-Qu>>bSz!&9g<#pwd!+RhoP+xC9i!s_(UX?FsnoxUY&^78NV9w<$TNN? zXI=urSPMZ3>%fx5mrdq`Fz;C4eAtM%^gL6!Em$)c891&*s0xHB(_B-I)@&RXag?V>`8 z5A=g!&3rhn3s`g$LOrPLb&D|WTZA=Z9t-m8mxN*Jj8e6=;B95E8s@UbQ&37>`AEBr z{4`cy<=lr#F}KLYJ1d3t1L_2>{3RKcQ)iSNn!=YBaKD6)7}{U(yCWOmk6}Nn@#^PJ zrSMy%l8Y7s9zPZs+XixHsTpD;*k#xQa)W>IM}M%C=4beN@cUV<-3pG%C3neJ9TS9i z&QI~yTjCt_1cuqDEDUp-zn?iksPCC)a1I&?7G1W~pK|x^5gW&Egm*Rdf->X-vi}4u zw#bW}%F-=nAim1q5R`Bl5WSz{Aj&OxNXW@=g-p|1f$5~BwSi9&vF3s#bw|X`^Dn%; zREQ*18`4=#)dFv-)c!^?JMTlfw1oRyN9{6xH%d|qVN_0UHDdXuJL2-yxV5Li8gKo?$Zi6S0G!Hwzy*$(c`QbwvS2s^mQ( zT_V$j-QQahW`hOwwt5a({s}P)CUewa*Pl_OR(6(7b7Be3=#8&MBVv2OSkwOV08h} z!>AnbhS)UC7JB41BGOJP$}sg%avPyGaGYwG3+n?GUxPX52UvYXGk?l$en%#dW5Vh- z@TJBYEU#dggOK4nNe#00fRB(8>d4z=tT7y9HBa{hQ04+dpPfD>JeZ{?a&9v!C%rN% zrEQUm5S1mz$Djt;%zBX3PHp@t<(lYd;=Me9i_;N#S4&#ovjLFyLZn@=S}%$B3{Zk( zxU;k?Ai9+gK7(K^=<1LnVf6~6GOiUp^RfYg=?M2HfhC!b-Nl1tuK~w)K=c92$55`^ zf=0eLEo@I^h$Pz6Fz+|;r4tMbRU!6XMpUeh_q+3x;Sj6QpZ%OFrHyF9rfb4E<~^EH z8)W}2+C^s+awM;m?%-E`Z$$0yF_;B&h}8xa1fyM8(ZLDV1ee9{1tzU6t=#ht;_rgq zwFtf>@*9VnP&=K?et|`oK&DV8)0$Qc{6&ZhLd0r1P>%ZpEZzUj63-9*U;@h&)Qbr$ zdauGSsghz#+Zk>N`D`O1J9UO(AHd=a`{R(8)%?3yK^2eRofC!(R&y?VF7ncX4(|9& zc+ss1ky5+RoDLe3QtEenokC+A9Z0qno=ved}*z4@lGqk>3TviI+IlG0`(>%$g}yGc@!O{J+dFq zA7(X`j}}s8bZdH_sTNXQIuR+R7g+RQ>5WQUqq5}n6VdwGBFNQ-!>smx!$K-gYC}i# 
zy(vUjITEGT1uPaO*`Y546Q)AMVi&!=CZ!8%fhg|&jm_0EL5he?ps7%YPzn7yj|22VeA)+C&+Fk~Vx9amDN zUne?RS1{n!o(1kk-2+CRl&CZ)4)JL}=_J zd>PLtoF0nSu>q~`fZ+(MQIeymbg3JC;KGnGBRUZ27$yPN91uSU{%YxGj!2Zx!Il8`M(#ue`rUfuLlCs5|E*g;$;;QaA+7Jb{JY)1s0uURW#)e^q^Mu zS~Bo==1Ywzh<6FoLBWf*b&mRKS-cKA`Dvutb5yi9gEfTFj53a-b`1 zrlPFwMASN5q|&FHV6FOb@iaxzN>O1o;>raz7A$4MgW(3W(0q zbrZG=Z)*3xHMwE)8jIp*Fsvscwg`{}7?Z^(FCB%>xd45vCm{OZEnCoGOf>DhJ?Tgr z1m@uvfb>FMB4Ph-7`C5xem4pcOGLy@`+U!8Mo-#KnWvOi&2S=Q);ocl6#__aXzX9; zmIZJ>nU9|JDI#WPjD8eY_&uxL*j1*|HA*+W>`A<$8U$|7WI)_aOyLU5%4+0g2cP}$ zQ@Fp;Y#9GNtJ9B4qFkIvgC}$*WzQZ9()lS!Wq%m966$3z|C4w*Vl>3#Ys~4e9xQt4 z0$A!P_1Msph^^}c>1zoQ8;tC)MpJsJDtUX|XzVcEjVYmSg~u@WXM<%hp*fcv$;xPK z<9ag-kGw>~nAeEd9)4!fXXb6(h}8!W?aHzwD%TLY#?q0bwSFnc%_o6{nxxbiROLHW z{K-Fk21ZAt-<>xF$5@@S|8^?pQmS>aBic{u;mags-`^y4UyBZ-#r)6W7cCeAzs6yG zI0zPf&e3gDxoV<`n_H1Jt&x|)5U>QAxaVt;i(g<^IX|Xu3@X&g=xya&c%0P){j?Qz zPiU8K>V-bTUkcJM6TvbXvRsF6&ICkf44yC+?x&-WC;1#_wXq)(u_*e|J63f<&6wwc zGI}N;<4p36_4wFP_+n$Ue?1oNKSAqzQE;5qC9g}OQsOXr!a+l{v!F5O>8O{_k^QgH z$1WiIjf`UAIOL^V-T9BBkHvpOxs(31TdJN|w|FBkD`vo#i72rRP;VNtuQ!frGaiL} zg-2x;+E)0v&bX-jvV!f$e z;F^TNm+7dNZOHyEmF4G@394W9J$C@2e?NFPSjN(b*AAqk=PRUgK9oEYsgx11P0-kD zzTE9|^nt@*nN^U>Y8*PGVDL?#2jiVc594crX_ybgLcy{VwZ9n=d&;*v{5k5y8cL2X z%w@IKllD=0#^=~_JIdwd^na~s@`X|^aom(>MNOsDe7 zFX;QI0VGU$FDSDk5V3Fsc@GRbfgs=H=R62f_Zghc47r$PeM!+U6!L;) z@MS3=`_TH%s~U4$`UNC<*DRQV6RgJj5gu-f%0N2v5(qBdWc$eeBHmXRyr{%Z8GLug8sYPUSIH5f*AFm{aor0@i* zlO|`Okf+nSAU~34ttHCZRd7EBC6=iUK66hFp%YvODuMnENILYWtGQ={X7vt)*I3J4JTR6_GZT@r<;WZ-k-!y(h=@M6j;_HFF7c& z)+)p)Gz25bh}!Q}coJu^!N(|bdnWas=}!{0m@l?bVA%wXyA^Y_Z<4wJU?>dlAcIEV%v}sO_G!kDfUoVY?Mi8gFf&jv*0AFuWyor6W<&< zESnJ93xM2>2Fnigu|h=5ph6Zcm<*N~h?q-Z9;?aQlY?)aL;ZaP62vwpvM~m!+yzJh z$|?xmvV`B+YzhQ31Fy#yT!&tr8@qp~TpVkTYFp)1MUWJTmt(O}(=o?MFUs2kj`=Zj`eMZI(~_h+9xR-aLI zoHEICsNJQ0WK^RjL|L&GkPM_!fnl@ZOD%uv$y8*2khzcXr{K$>lbB+2Y56&ktbfpy zDDKz;xTEMW6{weqD#T^_G`QawK~4um8`S%R3JDtNO%gh{z@oSo5j$?;7E~a8i&TjJ zt7!;wFJq)#A|Semuso{#I*)$UY$&O3Xim7nafsLni2oWOGgOE}iiAF$y 
zEOvA0RKZA?wJiy^+6>7d=xr?@=6#pXXEo_o$o|fG^nNEdQgpf{ zQ4YqzuzZsee+#~h1dFw?J&-I&j6w>Qe6Tx2LNiCoD=hgLZ9!~=2$dASW^ z3-qm%oWh*)>H|AQf))zyzWPPtFfh+gy}8Ds597^c@QfnoXKH2P#al3L!5aA(%Tmordu9mM}0weMn_H);k93oNJ&v^Ey78t1}X zoGc=!m9G=Ib=;B2cfLmLp97>`jS@C+b}RrndoE7u_z`3qThb-9F@nfrlY?JAUjGs5~a;nxL<0LL>&q9w!x9KkuhQNEYyAq zmfldlkkvS@D#5Kn6s-uANJ>v9+{0i(CtboqVnO`wm}~zXUvh00RK3-FKL&`l=v^u0 zCdSZm&$;BNXLrKAmBDhwBzv*M+o~g2p7QqNXJhW0@BOL((dECnM42zwQ2&Ajq-;|+ zqD=i3kRMG-tr^lq)mYW*+30BtdGlB*#BP~>g(~i`bbY}*%t{x+{JInMQjSzMfiGG_ ztcJh5cnL|45Z6=<51(oGe@kvtlO%a}9a1!EKygg-F$Nz_P=9;sZom zH1jHsZ}D{X^;sm=$%DvycY&o65o<|OLhq}!@4wi*XZBE2S;Xr4gcr;k_<0jsqTyX*|{(L@3*9;)sNX*XCD%49m zT;TnR&VQR9y=UIP^TP0_S&dD26_ws>pu=v45vEapBKf4iup4l{6N!FQrb2Angu<8j zLXPWWJPlvQ-lTH$ChECtHVLU1OqlkmFs#bNZSDa`k!o04YA7T+7^(CFi%wqr3zb`K zrOtNKh~MufBE3lk%T1G<>jubvm8F{)h6?Qh!MFpWzqRfb<=$?k?dDA*&weuy#%>>c z`2~gCpWtc}5xc=>UkJnCTZa;hH=M!Ff8jP&3b)equ-W8tGb7>JrGe#+Nq+4`I!0%y z<8?JZAHHmXVM_qf+PU7N(&=ruR+~*Wo)QS-mkL?lgDky}m+#f_TCU7Ty(~e!+y_Lr zxa)n~DJ9Wp-5j#$xq!~UAMRJ16l-r7c3vH?Zu1vlR>q*iyaz<@W?MsLY_U1ZmXmuw zi$qB~0LViXQX7~9`}6oNZxD76hGCg}h{Ynz zm-W~!^ML=vh`@cldY8 z?MbHjm662xEFw1bC=7dMQf3oq%zRAP&yVv8N4>;i9i7XcV>RV3o>OViF3J_GBC$mV z!p*`U6&v#LWfh%!mNpKR+f-kQzUmzYy zt3uX~U5vrE0jc~F5M8&Euc(r|ho(u($;=lN*B#mLTO;#Cwr&e+e|!moH{D z0bva|@9(8yO+)bUp+qv}P}zY6WE5CVsP(cV_D0`Cck)~$>Y<%)C~ zJZLdF7iuC*d;#)e4fjLQ#zXMU&-r=tSD^O&ps^gE3#@K_H?64T9Hc21BFOzTLX@aN z1i2|#mZJ83RETHY3K%*bg1M7_fz{ha=|tsf1~pw@K^)E!w2L#4r43kC5L}_FhSiK& zsZKFm5DzI~HNO|>L^<{_9eryxNzX-9PAvqa8CX_fRt{1d?ECDMXnlRrlq`HoSna7K zy{HU4LT5gVB92x=h&1^Oe6ban4Qr4WqZ+Z4POFg02K6aV39FmuZz0Ovvgm>#&R%GGS4F~EpU!m zkEd_~Sjzd`FIT~rmj+a(;RjZ8XksH#DLh84sy32&qxuo$*K^2zOI%2NjoJ@ZA?E^C zBOL7^m{ovi`+Hl7O#2)f^tnuS_;XmUN{~uBfw_`^dWnJi=lRRWS7VzEFrUQ$(e-p{ zBFbBHY5kUMq<-K)!WH}gNGm*2OMu3EH!2iLG%b(wHkS%mL^Aud$G%tB~LmW=Lx^SRX+2ZC5rEm41aZ zu5>4=vKOwP%OK0PcnBGf+TW}CGP-398muo~>-bVuQ}i!ek$Y7{9ezzF^;g`G{c_ZP zdnh-an9uv>VbNz!3|J&Qve{=SWwoB~T8K)|voy?WKQ2|=iDGvR-O^D|tm7cd-6|xc 
zA_haerP^UiSzVN1Cn}@Q(R)Ql$eccIgnLkl>~}z3%(C32w!YwDYXA`dXJDAV;!Z13 zd3v6@iATuJ!>&YXeFLfNC@}78P%q!95IjS|{%=HseP>WmFOBR)`S%O-#EkDrWKb`{ zvDj&+_I$EYmk`-_^?SoE0;+u{u-(QfLlx0^Hxg zPhPPOEI%2{k3282`r*C1ipsbuT0FLjeAyVjyr_f5x(c#aEY{I@m8IkTbr8^NL&xZL z{AE^iJf)i`t@@c38}5?D)yT`{r(o%Z%avHP@pz;%mOtSY2Z$ForF_F>R@?V!cai&W zn{Hfmhv)%eI$_($>MlrQVj-A~D&)-GIJLvrW%K}xF0Glf$YtH5!5tou2a&xAcla5U z+*6QCV$sL8BgirQJBN5Ec?{N`Y_RC{4z8kHQ%$>#dIG_qD!-~nA9Kbd=y-IPRMg07 zzV2UUNPoav^H*4nTf8gU#c#Ce%_CAi+=XxlFx0DiA^UM~KN}HS!6%l-Vmf4Mzf-Ho2I5)mMx=C1wWL0R z)IS!!98+1i=j+uC)-L0Lp^ViWj&c_z?KA2)<`KD=&5PP(#jgQ* zs6I$7W3`$?V5xmZl`X%KThl#>BAOu{f)xHWR)xuG_T6@yFC=X-EcSp!H{9Jrl>c~6 zgLmI2>%Z$ym`!>ZhF^0OZp5tY4a1Uny=4M|T{4UnHO>&jO^#@FS~<7|V%LdIMdYev2dw^2RlkrKtFGNaYq} z|2FFV82@g60t7P+*MpA=e#FlINk37ldrj-c-6f8)k;tF0BE$5-B(Nl@Ecd?OfNh+b9}E?gvl`!5o}%>QA9Quf zT@q)FMR6|+mH}w6J0O^i*unDndD@LIY@Y!U%r9rPV2=X9-6ZSx3lIn{;yGiZI_=!W|RaUcX znYSoi(}|IMJ&Ei(hR7^tr!*LSELCMO=fy8@6Ifb<<@z=D>o8c&!l7`Is_m8EhIkK)~fhxKMiElO;Qp@P+Y>ne&;jFs5uuq7Rk z6pV$WIkNAIc5wi$ZxU)hnNOI!1sZE%w5rT1tYGz}eSJmdu9axNx(Q9aH-RXFS|G^2 z0=M}f#J>W*?BFwwZvjiG5Awuc!`#0-6n)HEjE!qbvpY>9+$%d6=7%63#NgWi^(LaJ zZdJ372M~s9toBCiFzhf*#TTD8rAIv{5{|Y8WSAgzKY)cqRw34tx1x_NLny`rqD$X4 zOjN9FM7Ie|s1q}hFmGBzV?(j_96*qFsE`L2x5Aen3d~2-Ypniw3x81xZze99$kM7L zusm-gKHKz(z-1l44)d+rVd{G(VpaH5?Ut3S=9|NSoV6A6hgwj}BSAQeIf&7ZhNCGR zge-R=VjFmyuM)8*Z&DZdN>=-Dg%ITj2XW09JksP3V1?VJLZ((F zVpVWgw~IF)v^&f@= ziqeP{V%af0F7KujWl38xXBi&Vx*miq&8u}1?qF9RMYHZHwXNI)ZxtH>L@zz zn1DjgRJ)~F$>rYL0cm8+@Vx;Rozyo#R4&XZ?Ip#eX4YrU^ z*u5Qj8CGb%Sh^1=*q4tYrLXN@LWEN#$O7+dB%^XGpy|N8fH|Mk2EuXA28TJxX&{r7*p&8)v^VWGZmGO7Q}YSv%<{J-D# zT@8F!&#ogUFX%eJ!*gI)5AUG^y_mOe-u*BCMNXf*;NO7%_;1AG$qN=u|C`S&m_B*V zjA`Nj5v=~v{DG18z zkJVjV>Q3cXKas}NO@jZf=Yo>(h$t1m8JSf!5Vl1Sw*9N%051|!GW2u?AsDx~bdKseUYm*cv_if&@KCzZW#5XNJhux{)_L4Ng|FifLSspAJJiKqwbC1pE0-$9txLd*kC9%nxOc-&C% z&B9ksapALAU1^Y!Dus_o_oz%^_ji_r*MWmh9lwq2{@&%Qqd8$2ve=(`x?|Fo2?#XwbPn?M7icVnsld7;No>e-qnUS z^?3UjTKKN$|O z8vW@{s8V4?6SrIu&M@z=2X#R8-#{+9p^>9_r6QGI{hblLzsF!E%pq1AQXC4ou%^Qj 
zuL!P7-w8}+M_RMzEy&*;+O-(5B=MU^n$SDl%yGerE|E;7OpXn$9`cJ2AA*e4ccC2j z2dwn^mz4y5*n1PKOhvz#U`6j;@)=dq?Pw>%4WWp&BC>NgIQAY^T;P8^>av!98z-pb z@xOK4kj-k&ESZP8w5G$Ho(Ru+HYZX>cbeZtV^S(y@y%Pou}Hq?y@2X}1A+a%D4W&3 zHO{9@pElGx@1amsV@tT^J&+DS0{fy9x53InJ|vN#E|z%RozG@p721p zn`Tdx&b_E4=}a8w3c1^-Lf#FefT!}Oo@e@Iv-;8J7C@BR(v`~_g!8xB5>D3#gwdI# zI#;ka4N0EE&nl!)n9itv0)Lp*)I3;3l`-w;U8YXRaP3N@*uJo$hn0Tl#C58b!hs^B zuOpINXE@Ai@4j3_<;flBr~%i7m|91oG`PZwg-Lc80K!D55b>x8_zVbaKi|WwZt%k; zRCaNo-qE)OQ&?9bZ+1ucmL@sH3xwaOLUw-W2aZkWeXPg9iazGzGRk%8NK0qk6JFcG zu`8H2k`*#$mb+9n$=5FU!O9K{iy^)_tj6o{7Z6xS>aeIm@VwfMaCa~XQkucaP!MLX z8ot5HU!66HC#wxPsEgBTstoK($LN{_|2SvDTpb8Va}%@57az+3lf(GJJb$n{pLYpK zEXrYZ-L-2dGo(8$w`>$bdUqwtKre8t1#YlRP|FMz;uA0eso0E+^@0`s_$`3E>`seT zJ{B_jIbnhgLin~OE>{G{P9T;EeB#Lwkd94|z5|ADS&fnwO{EJx={;A5j2+d5NJlXV zxK@DpgYYL+$Y7TMWUm^cWMBL(tIgKMQ000r+8Fpw@cE$=k-i)PNE<*#fQlDYNYJ4G zWb81wwgy&ombEdIJJ6e2x7U(k_W&6+9FVpkOaMZ@3I8QNXygY-_(3TDB|!A;hsID@ z<3daOHVJY;dm?4~AY<)8m{Ax>&s50xYaf7P6@0eeF2fP*FnU;N4qh#36sq&>Axgeb z%Z_kt6hgiVNGzWz1q!F0pHt_{5mp;y0;H-h-R<>ASZHTQn3qOa>4YQ)fh5(4C4~=a zF%njyVa1t0!s=Qth^2C{8?_32Dr|hzf=EdO5J$u^2L7Lhl??uP`AF!%9zMTI3>?!t zSRj^buJk2$S4ivIg2=xMfH2E=3WP|IS3pT@_`{s5?)V$qdvRr06(^f4?t8vSp*E=zBuDf7K-PANgF6ewhR-V?fId_~slybjHw$;}Cup8hNts zF;*M*J_(DWAH8MWDAbRADkx)S0W#hsU)zX}9YHL%#`dqqA^b;>zGub9SY6tN6e=YJ z(BlpoqMZ$nxy(Sne1htK2^~9!>RTDbr17XrmAdmEgN`M9MY$7!v}cB%*tC8vFso)F zmPu%_O<->(s;@VW?l1w3e2GVA7U#2?QEk70CFFAcGbrqyX3hy!^caCfjeFw;=VZTMIw!hgy)Sxp4_&zlcc zu%g>)w+q8!3>{zJf!J(m61e6Oh-C))We2LiOSSTI`b0IZraq~F=-&<74J+elA?hN(%bk-U|ueOW8tu} z6TQC`8GFokI{XRx#Rg1{DJfvJHk0>JdFCh7vROaUH2x1kDO`+(kH02+DW3dFJ;kbRK8vnt0NmwyV1-Zm4a_&BTac>qYCDb#e+i+J|= zL*R6)kg=7Z<$lP;4+y`UKk|ne@dCNg&i)G}%{!e0v=I|Nax zQPuL)_E0$4#po3CLCJAeCr!ykBhR3XA^xP$MoX0SYY=`cS}aE$d}d7!qm$rdM`O(% z1`vH&>38Utnbe^nkk|~+VIr@El{oA$x!{-`+NzL`_<0hNtT#gW4JTMla?7KX)6K>L zA4uXR>j?Kg8df%nuh8RHR~Av zJz_a@0#j@rtvVx;jrUp*#RGc)cLWMkjeeP=LR@D|NBGT<eH+59HA4y^NhGLS?{;-3r6=1E5kwHxSE*uwr9$+B*ZWIHO;B8H-qLvQ<9iqQY@W z@LH>8>eOKfB 
z(KF##aB)Mhjj@>3xRey&WD!ZN{hY~-V~#|=^(A`$3?NNvmau^vAD7L9W3zbkMqbQn zpEZ1slf?>pvBra>+GFQ$NCcz|wEP21ZVr+-8j_mx7`9rIT~4m`98H)SC}MG zMj3?nJ*-p_I)LixwHH=Pxdui@LM|d$xQr%owNj0DN0MVAtx{Tvn0$qD`Hi>igB=X)}uu_AJwIS)@ch%kxTx#Akd#j}^ zWp(`vYp`7;(lDQyB&N+^K+<648kE01Y3lPUW@R1U*m*uO=7W=X46NwKPr63AvP4?; z$pVt48BDkjF*_@2(J!5Ff%hwv|0X|X&-{PYh2c-K8rvncRC>LM23(IIjMX3_`KH6M zYY4w9iFr_|LhL$(BbI~`j_YqciCD&6r*g~|>a}7H39BAPn9dn+tk%SB?F~q&>R4t* zI4C*{rSyjtoxJoHD!1KEUF@e5|KCkSdYu6)*G+PPJ0Sa2D?P;sbZB=F#sd)jjSV*_ z_hvioG=DmI@|%G$_WKaaFKFaJ1Xr8L*fl=yTm%N+2DDg$;S_fMb2q6{vYlo{%pn(B z8VT1a6IO1SBLvTL)FXX$wS%^}8ik-hTe;Q|G@*Cx*rqW1< z2(omEktmlAA(r1vxI@9caz2uLly8Vwgo(BmgsC)~#t!565I4PF)B2_*B!7j0NDB_b z%0pCNz+LP`IF`eQSuDnU*@)dz4=cLNRgF|Ry_4E*Tt>26`Vyuh2T6Wp;;IDn%RLqH z*|&?;bD8-ldYaWY9{!zjd(vo8O%!oCjf_n@0>_@1lsN<(GanQ7^W(jjpkLy!j?UxH zu$ro8PpLF?7v+lAkhoF<;bvnHa!puBOt^}+hQegQ}FCdSkRUsS4Eydv5gi?M6h_2^}7gR~xLo=n7WY#l^>yA9c@*0DW zAY(64%Kd!rR?F1m8+J=T^jqmmD$h-)JIg*N`X*Qzifw~=k8K<^s(6OF9OUydmw_;s zv5&1boMkofKfk8Z@x3&PtRzVf{D_haS+8U8RAB@$ucBrvV@E}zF7b#Zgg?t_%dfwo zl9)lQ$E_rH`}h;?BDRfHT0x2pKrBrvl~`+I3bi!gkA1d&YnRCZti84WAP)qY7${~VBHgZUz_ z3_E|9*HoUKL2Vy~lKwOROfEnvtpp})9Le_k7O`C8y;gjIVez5LEw4 zDmWGm$L{dugH}Sv0`YpG;T)@JYMR;MbSXI#ZX!%VG3sK2@WUbFVff~!{QL#0(EI-2SibK$R=1$1 zR#fs2()4qYb>tgOMT9I6WJ zyS&wqz5x&=3*T~9`+bUDREB&@XWfq`j@HA8H02axu@jh0>rfY?nz8h*Yf#FU>QkO_ zRyRM;LX>;v(uJ3z$vWGiL{?5g`dZ9!Qj5 z&!GBka3S#}dOuW!oC#WsbaVz`)&Qa%*L&9l(!eqrfoY&(~u#AEB*nHws@qL2##%6HF;gHXb{E?uNnRa zoc9mfh|;VR)IMV;X>{WV6Rsd*?F6ME5zqR}hxc55cTBWOOY>un^Q@+oMGH~deS+RH zrIDX64?JtnfOG$sSV*!|Na%4h zq#Xp-7Z82N)h$J3UQLMEX1_o}gsX%h>9^}}l&U%_fhKerRP z7p2tU*EG^}$sN_NLhpA5a}$X9yl);B{pZHQiiAft`wSJV*6VF+QR#D+c5d!1Z@)xd$1`;M1(vgD`1$Vzq=9F@D zM4s$Un1k10rK=!`F_4QiIFrT~F8UK<-e470mouP~$apAp_uSJY?~)t7`3CCZ498+n zO7jDVRNnTX`Ld{=Dt!vhFH<{<%KD46a8nt%w-t+G;B92Ao1g^7Vj(d*mc%;;#R0Mt z5A2rl7g$a1&yJ#;Qc35wuOyBGdJ{R~9x~QV;MT^fR?PPo^%-%{vDdz*o^Jv~+j)c& zs$Wg(lP-|5^VncZusU$vp~|lPx?WSAMaAkW71vxQUZH&n zGX$$c9UvvBL`THOJC>iqe{&X_?_RSz3p>Y<&zXJ#o1b2N_R5DdYI#-tmBV 
zV^bG&T-|*{niN?TuFz#y-0@tmraZx`dfXRIXxjYU!mW(9F@^2jyz~r%5d-7mKuXk`0<@!3>bMzw+23`3@ z6Lic4kDwEvFd68PwS42}X2>AG+wqrJjeCL{8tF>70~qRceNp{*grA3u zt>Tla5-^d&@i66);SzoxRo`Efb~ez)ptr=*WdPv}?+}Zdz}$?((p!L7B6)w$jW{F6 z0&)=$-LjS*BKN3~CWpQxjd_SAk`bjkcQyP7pOdPU(a9S@eor|05ggOIck~n)o8PH* z)Jx)3=T4+7OtqB$f;1=&u^d&ca8Ea?8?1fyJwqj{IUMaFO4=vXaqI(fKE;zTyEURD zc?e2KyjmAkWW_H5xvxG*tz@;DL$K2Dgeu#9BR6Jv5k)jZJOydVmsl01sMUAhZN8AS z!?4%`E4q;$o}&E6QyRMaF4_3aAi`|X!!i7TtYkB0Wj{EU!s{&)k>onxY`<1~C97Y1 z305w>pf=J~GV$^d!dU81T2K5IZ5xKgFL*=?QTj9ynmikjQp1m|CUP7nN?9-I(ySXK zai|Zjp)5rukV7fAq53z`???H!`x8N!>9`(zQ2Zly{tpI6@B0^FHwnpOSSuMlRJO-5vdnuXUZT!o{$17DXNt_-)+J+PR$R7 zimOGaTDzKn9DdSZ&p;!6Mh5 z5yv*&!3q~lm?O=QF+kp?Vel=6V`q8CiJOtJc=N9a0MT9jVyGzH)QG0x4MeO9Ci2(K z!LcEjFT0^I;Rf@f7+J6xb?JriS`Eka)OEPXBx^<6SI>yoypM=H(*{W%DsaLca7% zYkWl|K`;78GSnqx9AW0zin0TKo3Lpg=8Fu+a{26iTQJ3D;I^kDtmwM=@*;QGLOjx4 zOVc}iLgYypgcW$a*)I8|>-e+bdYnJJ%brDY4@GAN91SG@i(xwgSX} z-^FBM=eGfq5AxSSw}LPh<`vGTn$;}2Du~Q*E3rJCrM4p{;OU#$iV=?p_oMgS@Xh=A zx%;+ahl$1zKVDqTYI|f8QI2mWelggBR<4;qm~9xhxhB<0%^n`jy9W>J&5#DP*fv8o ztNYqb6s1^evHxL9Iyfa13rQPT2;IwKiJUC_`H#$$kR2^&rT<3bCZ}*~hlQN`)`##9zVOzc>OqW+TSMx1f1lCll_4 zJsk5#k`H3=Z326fAgbHd>f-@~;R>t078igWriJ+IqZagt*CfKxc7Oy3Qm+G8NMsda zGi5t;Yz0y=0T5l*jsQ`ywiVqcHmAmaB0k zS6R)HT3Bh{8Gh19QOWWIC2ORxeQEB<;&}K;3zr}wA@vEmDSpP9gID> zwYW9t50nmP7B@%HC3PYiIY(7XGn1=)k^!+YX8T=(6`eF7NK`J^iSAck5^`rL;R+nY zq&wp=U-Hd+G6c4SPu!i1x&)M%FIKMNK6%_okvr5%-S<^|3cN>Erg`EN9 zxsUlVNA0U*^HH=kB+=G*jn$+-A0aBgwiGX3JV`=ZM-thglUR0Oj37IdnVF0qO=*mJ zF~vM94LFMxUt_iUq!A*st)+OR?NR(vZ6#rrIEr@5#-jQc5Ze&^{;iF%F*OC!Hx6|P xEV;(&K79pRrrC*w6$PYn%^F1gh5S}Ci5|B_78j2Jvq)2J7Y|904le_eJ26HB3d!_YVxI5b*A>lS9;stmS zBo{nD9w0@U^eLzyi8J5cIxP4HnTnNWc6N8>o7tJ!eY?6s>-3n4G5Y%Xa7DyAe;AOe zsG}%V)pDgaUULd=y88CX>d|;S#ujWa&dzt9SOGYmy(;h5Yc=|Ldvkr;>KXICVWK7* zJ|;I=BgcMIs8XTlE9m%5(mgSpvj^lWIPk_m=bc7ZXPJ)MnXk|?Nt72cAc)0d_f@If zb}GC3o?NNkFXrjsE)CN(#FkGx`m4gP%g#Y?_nUJn9`9XDdoOlEWmB$P+EhPEb<{Lm z!f}AohhU%}ctCUkMxbG^PBa3e%a24~f`^ucaguj9T>!f*;8bjCAP!h%NtcVWOhvMG 
z$tckM<|{!ySxZCP+DPjN#Q2o!Jxbq%J(olz#oC73QtAyo9b%&$K(T$6QbAO z5A6(#ij|6P0sd2=XF|_~c7$FC?FqdU0tjw~Mnmn;R?<*}Fky>;fSJTh`5>XZlQL5) z>7~@teWpymqm26;6YF*-r&vQ#mUS`u;~mWm7*4l^Fd$C?_5+&^03MJM{~^kYpXuSh GNy#01&~dKaYE>#sC#7mmWNJ*O zYpG;tl9o%*bT5^FP4L5ZNHVwcIcEkO@6rhj<;I-wl~W`f-&Rk?(#SEYiv9iz;!PW z9l{VWcI!k3F!c2W(Hr+-X>#|#y@U`pxqIlYf`Co#l6RSBE(~%vJnn1bZ8SHy!h6j# z{nUMVR3Gl&w!T3I+8kQj18!@`XfiOM3H=}<+U7>Z_rgCGopx>R|U1PdC_^nq07k;j4 z>Q|hpdbQS2)uZD^gZ@q)|4rI`Yp^S?%M~o^H4za6p!FxBF$4i4y-suwK@T@X zfAnWBq9o0`oGox$*5O=iYA749a%qpJYfi!Skwd<$OZg=5quMT7wids2o3~?0vr-2L!p6> zvXX)#gb7;&1fOZbln+zNyBRaJ(|$&6GhlM&k{k~>AvT?EPKg0g)MGJP`i_KyO}`2- W!hplT<^#YExRRw$SvVi&N}d1$bO#*( literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata5_113.dta b/pandas/io/tests/data/stata5_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..3615928d558388f77e427bd7444798eeb1654701 GIT binary patch literal 4628 zcmeHJJxmlq6n-lvl4z_f)R-`YNHj5*-Qy6C5QRey{6SH`A1rk5cJFrK{Mt!f&1WyM!s!U4R=z8+;ba2$t*=p1VGA9ieH9hxU+ zK9c>lzH0b6`)kgb`!0nBv3?$8g+Z75mpMn0Hi3 z0{0)818yiwTU!kQ=#i*lcK5{MiDYW&Jh7+z!u)@^2ocp#1IWDOT+o(q5iU9|!rG)w zbdjmZI&)R{X8Ax9V4d%+IFw2=EQM}}%4@=QiyLrtq;tf#F# z$Vwt?s2G`PM1a`PQCuA`<9%@y(-To_pp>4Dn;7dF3Js!mtYX!J{_&*V$h198YowHO zA(0yScsp3mOJTRTn7RK*S`yT$_CKx#I?A~V|(tQD8?jgoP8Z*7z8e_i5`=r~&XUck5UV%saKf~8Y7@iI0d<4GkhnrvQW7lIB zPyGhlvB{!rc+a`mPPiavMZiyF2 zj6aE7{7F3cm*_=L#z;JCLc-Al2NHp+L`_5l)|uHESXx>%CPpZev~S*f-+Mdr?Rz`B zN(l!_!8Txyg_px%4*fi@hKV6*FI)0q1NNtXT;t{fV=1ezv(jn9zR~Az#?4F!3whWg zDA4WVz0l=5;H?Hf=NA@@#cy1oUz08X7tf$MG7{I2zW^Ybh=&Vi2t^YD%8`GFiBuqp zR*`pz0$QXd=Jf+rQzCd3c?SjL-zESRB`c#HKBp^8O}{m_fE*%Nj&*yz>QWvRusHyh zfR&fdr~QL~0wk}xDu-ovs{*o(4&O2X*}hHvj~UPf6v7n>GYkW}w}0)Z#kh&tAwQOn z?bw<-HWTJ@0ya5Zt855qe^EP4W^o=ICB-=iK5W4mIjXZ8FhO?f#^Q-& zb{yMNZey-rE&`}g@Bkq5j&gw|;v!gdTm;2Q9o0p~K4UC;v==pHSKb<+Rct_9h{aYH z;sBSKGz=YaanFHja#h+ep1P!<#+s7Z>t52^{M$$|zW3PeCnmS9XsM`EcM zj7ssaCc~tp#3Gs|i7?O)+cZHFobW$S>JClo)3`@UDHj~-k&ky8%V{g@Wd}3WpKw_Q 
zYHy~}Jf8tb&ijdFc2}H8b|~MpjgWI|>-23R>+jk7FrYRqllPHZpRbemgX9h~?qT2& z(OKSS-6lR$)vTg$i_*aunmjpK6!O=lr(n_-fk3J6toQ`*e)9|AWA_fhK>a5z^}#MOZL(aAv%K8Ye1A- zBMO^(8sYvm0Le@`u_i-2nUOJvcTq9}(Ij4lXHjIlNThi70}~lKS%qg|0Pk%AP{WmE zB-J6mnD8WArDIbJz$&kA(}7^nTty5|S9Qo--wd8irPkq_+E%<}OU6ETz867k+a|u9 z-@EC#Fq5AM6Z}*a*dlbq+S1U`q2TKsHU+Q(;~#`M61yWmb z6aY1z(DdHEWGbD>t{lhrRGgR_SBohAok#&7_m*ivYOF=9vlfnvT-AcTREu6U zRE)l{L5I?UwNOeeE|f7LH*4D#*5an4#b*m#FV>>pFwwx^(D29&mMt`vR=d15ZjPub z0CQLik{8Y{XDqPiVWcVCe7q&n+IHgPsnhLe&UTzT-`UlD;o_ysJ)2jHIGD4Fw0M1` zRSMJw>kic)4vC;-w2;&}77wntSa=-i(sUC#|6iacn-D*^VN8<^IfhijJP6GUM4F7C zZa@q~nhI0$u$~%7!K9i_APr_!Bc&sxDlj?%JCTfJKm5;01Mzulp5%y+X(8Yo`Sc)C z$x`6}znGc*Tv*{?{x(+gH$ZkVOg;I%X&ik_0!RIRaC@;rDHqZE-35utkt`EmI{B-#FKK8$G@WgJiYzr^S z0w0+c`>+=H?5JTprKCj<&JlKAu-!XaNd7=Aag>r4m4CJ1=LV6$lEhv)ErseuVz~;B NEtiWnH$#s2{{X{R*%$x- literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata6.csv b/pandas/io/tests/data/stata6.csv new file mode 100644 index 0000000000000..27a1dc64f530b --- /dev/null +++ b/pandas/io/tests/data/stata6.csv @@ -0,0 +1,6 @@ +byte_,int_,long_,float_,double_,date_td,string_,string_1 +0,0,0,0,0,1960-01-01,"a","a" +1,1,1,1,1,3014-12-31,"ab","b" +-1,-1,-1,-1,-1,2014-12-31,"abc","c" +100,32740,-2147483647,-1.7010000002777e+38,-2.000000000000e+307,1970-01-01,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. 
This string has 244 characters, so that ir is the maximum length permitted","d" +-127,-32767,2147483620,1.7010000002777e+38,8.000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e" diff --git a/pandas/io/tests/data/stata6.dta b/pandas/io/tests/data/stata6.dta new file mode 100644 index 0000000000000000000000000000000000000000..d8a46ea3b9696ffa4939b0bf1fb697f9b8b5833d GIT binary patch literal 3048 zcmXSBVr1Z8U}b=S<RO)OF{GB7kzFtD^TF=zPw=kLG&|GqFLRhFd2GZ+|xv|Q=SX zGe}1j!p|EP8i1ipnx~CSOp5V`3#zQDWie?splTn%yst~a85}hP>Jx!{P&t4T{Qv(S zB**tT6kIMm_|XY2SC%R>j2Z|Dw9zocl6Y_@8!U=pLgXh#cSz#2I}1*n9SoOTthN4ixnX45rvGzVg+CY zs*s$KSd^Gtl3G-(qfnf$P?C{YqL5jn0906#k*biJSdp1qnyZkLnwMUZp-_-ol$%*n zlA5BBRH+bLl30?cr!dG3O=)Coc=G>$JuoclAYpN*o*^N{6__qkkn>1kQE^FWS$Rce R6*QB8(-OAIYv|^YQ~=?gqTK)h literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata6_113.dta b/pandas/io/tests/data/stata6_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..2e4795b167f266cf86afd73033bc1a43ae9afc5e GIT binary patch literal 2752 zcmXS9Vr1Z8U}b=S1A8YsF#?%ZsWs&c3Wf^4iA4%V28Jez%uoeT>cf$VV08@t|NjRG zf%JgDf~Bh5UB>f5omR2Sf48Q;U{rCUh7sjN@lGJzx14FPW2~Zx7f}H%k zbi5{{<>V&<&A<&(@=KF)fUdv^QWAlVEJ?vB&rn=alnHVLP5@Pa+nWpwYD}oQRW0=l z(xE~mvr`xldW}p>is9xIGr-uWwvAxk*QG32e4hvugQaCG(hRWl&i6Ug0i^E1k4|tt zSgOo0Y9KVgM#B&@PP?<<#M!}c3Y^v%p?p{(egNe&pG*g< z{{)kVx&S8M@WTo2CcK#gs%r1!La=!pbMwK~28@Okb&W{o!3~Bo5*Zm8Aa*p^Lxeuq zJE-mPNK9g2NKH%6$jr*l$<50zK-UNJ6J`hx-LOakrWz1nU})G64U7E@ALh6vCMA!! 
zut-jMQV&cQ4ay(>|1&OGVj<5ELH$kdkV=JiA9OYC8KPJJT!HB#1v!rt78RG2mX%jjRzY(MI4vSl}3&VdPD7`tI;j1s#T!}4{8B2iT%}=H?lmoHk zWTVysf7Y_?{ERX=Kz$eaw=e_%_%szPpJ zMP_bku0l>~UV2G}LP2U#Ze~eIYKlTqr9yB?Vo9Q&!XP&^rIE4W$^ZZLz_6%;gvFhD ihJ+MXV7f>_&Lf3I#U-U>Zp6cc4dZ z9C!c_XOQ{^Jwz%G5S1T++QzKyH8vFvqNh@r%lwaLKh5r#bt%vXy%15qu9Zuw*$z)W|04R8&iJkx7(@COOUY1LLk4bdgCiAp4jA!y&v$g`wg| zPEXXkrT-#9pnShWj}{8-Zfuf>p(4BA7n0ORE}d=n+=2iRw%~opB;WS~-@wUy_>MF$ zHm8F4?Ll}dC_#I|Py&%+*<+Im`cwRSAZMcC9RC^)X9JGHSy!YEvYV#VyASm+{zvSW zjl_QMB3XtVFNe(A|Gpa5>-*w#2d0?EE1{mMCsVE&Q51;2;Y~!w*2LuS-Yii8v?bjz zm)~1<*;%15E_;s5L2eL^ri&tw8yebV&9o~vT3T7>2CitXZE~(r zRI5102`3}?j~AtN#XI)>n!w_7gvGZSEYfK)7c_lGR$ay`A3m;C*X1P%XX&=(^>FWq Fb_Muvm<#{_ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1aea3a67394dc..eb70cc51f786b 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -29,9 +29,18 @@ def setUp(self): self.dirpath = tm.get_data_path() self.dta1 = os.path.join(self.dirpath, 'stata1.dta') self.dta2 = os.path.join(self.dirpath, 'stata2.dta') + self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta') + self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta') + self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta') self.dta3 = os.path.join(self.dirpath, 'stata3.dta') + self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta') + self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta') + self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') self.csv3 = os.path.join(self.dirpath, 'stata3.csv') self.dta4 = os.path.join(self.dirpath, 'stata4.dta') + self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') + self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') + self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') self.dta7 = os.path.join(self.dirpath, 'cancer.dta') self.csv7 = os.path.join(self.dirpath, 'cancer.csv') self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta') @@ -45,7 +54,13 @@ def setUp(self): 
self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') self.csv14 = os.path.join(self.dirpath, 'stata5.csv') self.dta14 = os.path.join(self.dirpath, 'stata5.dta') - self.dta14_10 = os.path.join(self.dirpath, 'stata5_v10.dta') + self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') + self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta') + self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta') + self.csv15 = os.path.join(self.dirpath, 'stata6.csv') + self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta') + self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta') + self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) @@ -112,20 +127,30 @@ def test_read_dta2(self): 'monthly_date', 'quarterly_date', 'half_yearly_date', 'yearly_date'] ) + expected['yearly_date'] = expected['yearly_date'].astype('O') with warnings.catch_warnings(record=True) as w: parsed = self.read_dta(self.dta2) parsed_13 = self.read_dta(self.dta2_13) + # parsed_113 = self.read_dta(self.dta2_113) + parsed_114 = self.read_dta(self.dta2_114) # Redundant + parsed_115 = self.read_dta(self.dta2_115) np.testing.assert_equal( len(w), 1) # should get a warning for that format. 
# buggy test because of the NaT comparison on certain platforms - # - #tm.assert_frame_equal(parsed, expected) - #tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed, expected) + # Format 113 test fails since it does not support tc and tC formats + #tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_13, expected) def test_read_dta3(self): parsed = self.read_dta(self.dta3) + parsed_113 = self.read_dta(self.dta3_113) + parsed_114 = self.read_dta(self.dta3_114) + parsed_115 = self.read_dta(self.dta3_115) parsed_13 = self.read_dta(self.dta3_13) # match stata here @@ -135,10 +160,16 @@ def test_read_dta3(self): expected['quarter'] = expected['quarter'].astype(np.int8) tm.assert_frame_equal(parsed, expected) + tm.assert_frame_equal(parsed, parsed_113) + tm.assert_frame_equal(parsed, parsed_114) + tm.assert_frame_equal(parsed, parsed_115) tm.assert_frame_equal(parsed_13, expected) def test_read_dta4(self): parsed = self.read_dta(self.dta4) + parsed_113 = self.read_dta(self.dta4_113) + parsed_114 = self.read_dta(self.dta4_114) + parsed_115 = self.read_dta(self.dta4_115) parsed_13 = self.read_dta(self.dta4_13) expected = DataFrame.from_records( [ @@ -157,6 +188,9 @@ def test_read_dta4(self): 'labeled_with_missings', 'float_labelled']) tm.assert_frame_equal(parsed, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) tm.assert_frame_equal(parsed_13, expected) def test_read_write_dta5(self): @@ -302,14 +336,39 @@ def test_read_write_dta13(self): def test_read_write_reread_dat14(self): parsed = self.read_dta(self.dta14) parsed.index.name = 'index' - parsed_10 = self.read_dta(self.dta14_10) - parsed_10.index.name = 'index' - tm.assert_frame_equal(parsed_10,parsed) + parsed_113 = self.read_dta(self.dta14_113) + parsed_113.index.name = 'index' + 
parsed_114 = self.read_dta(self.dta14_114) + parsed_114.index.name = 'index' + parsed_115 = self.read_dta(self.dta14_115) + parsed_115.index.name = 'index' + + tm.assert_frame_equal(parsed_113, parsed) + tm.assert_frame_equal(parsed_114, parsed) + tm.assert_frame_equal(parsed_115, parsed) + with tm.ensure_clean() as path: parsed.to_stata(path, {'date_td': 'tc'}, write_index=False) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed) + def test_read_write_reread_dta15(self): + expected = self.read_csv(self.csv15) + expected['byte_'] = expected['byte_'].astype(np.int8) + expected['int_'] = expected['int_'].astype(np.int16) + expected['long_'] = expected['long_'].astype(np.int32) + expected['float_'] = expected['float_'].astype(np.float32) + expected['double_'] = expected['double_'].astype(np.float64) + expected['date_td'] = expected['date_td'].apply(datetime.strptime, args=('%Y-%m-%d',)) + + parsed_113 = self.read_dta(self.dta15_113) + parsed_114 = self.read_dta(self.dta15_114) + parsed_115 = self.read_dta(self.dta15_115) + + tm.assert_frame_equal(expected, parsed_114) + tm.assert_frame_equal(parsed_113, parsed_114) + tm.assert_frame_equal(parsed_114, parsed_115) + if __name__ == '__main__': From ae3464265a63a11bfaada75b28502191968ad7ef Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Sun, 2 Mar 2014 08:39:20 +0000 Subject: [PATCH 63/64] Added expected result to test Renamed Stata data files to include file format --- .../tests/data/{stata1.dta => stata1_114.dta} | Bin .../data/{stata1_v13.dta => stata1_117.dta} | Bin pandas/io/tests/data/stata2.dta | Bin 1786 -> 0 bytes pandas/io/tests/data/stata2_114.dta | Bin 1786 -> 1786 bytes .../data/{stata2_v13.dta => stata2_117.dta} | Bin pandas/io/tests/data/stata3.dta | Bin 13255 -> 0 bytes pandas/io/tests/data/stata3_114.dta | Bin 13255 -> 13255 bytes .../data/{stata3_v13.dta => stata3_117.dta} | Bin pandas/io/tests/data/stata4.dta | Bin 1713 -> 0 bytes 
pandas/io/tests/data/stata4_114.dta | Bin 1713 -> 1713 bytes .../data/{stata4_v13.dta => stata4_117.dta} | Bin pandas/io/tests/data/stata5.csv | 12 +-- pandas/io/tests/data/stata5.dta | Bin 4924 -> 0 bytes pandas/io/tests/data/stata5_v10.dta | Bin 4924 -> 0 bytes pandas/io/tests/data/stata6.dta | Bin 3048 -> 0 bytes pandas/io/tests/test_stata.py | 89 ++++++++++-------- 16 files changed, 54 insertions(+), 47 deletions(-) rename pandas/io/tests/data/{stata1.dta => stata1_114.dta} (100%) rename pandas/io/tests/data/{stata1_v13.dta => stata1_117.dta} (100%) delete mode 100644 pandas/io/tests/data/stata2.dta rename pandas/io/tests/data/{stata2_v13.dta => stata2_117.dta} (100%) delete mode 100644 pandas/io/tests/data/stata3.dta rename pandas/io/tests/data/{stata3_v13.dta => stata3_117.dta} (100%) delete mode 100644 pandas/io/tests/data/stata4.dta rename pandas/io/tests/data/{stata4_v13.dta => stata4_117.dta} (100%) delete mode 100644 pandas/io/tests/data/stata5.dta delete mode 100644 pandas/io/tests/data/stata5_v10.dta delete mode 100644 pandas/io/tests/data/stata6.dta diff --git a/pandas/io/tests/data/stata1.dta b/pandas/io/tests/data/stata1_114.dta similarity index 100% rename from pandas/io/tests/data/stata1.dta rename to pandas/io/tests/data/stata1_114.dta diff --git a/pandas/io/tests/data/stata1_v13.dta b/pandas/io/tests/data/stata1_117.dta similarity index 100% rename from pandas/io/tests/data/stata1_v13.dta rename to pandas/io/tests/data/stata1_117.dta diff --git a/pandas/io/tests/data/stata2.dta b/pandas/io/tests/data/stata2.dta deleted file mode 100644 index c60cf480ad5dd82db28475872f08a5280e7a80ed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1786 zcmXS7Vr1Z8U}k`TuZEwM#2Zf-J2W&hG6IDfv<-ps3=9Vt6HsUdi{;E9@ed#O$N#Tq zVAxmxX)=h0=`ZV<+WFtmSiv{3NWsVe$S^dwGB#oO|NqaQKfizfNl7e8Ey>JHjZcPH zi$TH_BxR;!RRmIip$l1{JT)~tr!pS5y4?J{k_>z*3QH4Yn(E6GUANsF&cO~k1L zpB%DVk=Uvw$qWYRNCJeIswK{(YDi%q#R3M^l5$ctvBq-v-nO#`7|AC(^s 
zfzc44dkARUeA`~?#L%$t)f*t4<*vyU!DN+kiyh>U8KT(-LG-f8W(UQ-FaLYw|GRHY VAORqF0IZuJ^wW9}Oig`S4*($EbN>JU diff --git a/pandas/io/tests/data/stata2_114.dta b/pandas/io/tests/data/stata2_114.dta index df9c19ee0a9762757d775f5793760b67a2085ebb..c60cf480ad5dd82db28475872f08a5280e7a80ed 100644 GIT binary patch delta 219 zcmeyx`-``}h>4MbgMpa=0=^o4RuXSKVeHV*$jAs3YS14MbgMpa=0;GTx2%ofN{0$PxPU>Lz^0S%o0T8FfXERjrSFzRsF<8CU z|Nl9R3=HB78-OGO0}~M2#j}>l0!dX&VbUDn?Ew`WuAC}WAZvS9{@S@L)HKQ diff --git a/pandas/io/tests/data/stata2_v13.dta b/pandas/io/tests/data/stata2_117.dta similarity index 100% rename from pandas/io/tests/data/stata2_v13.dta rename to pandas/io/tests/data/stata2_117.dta diff --git a/pandas/io/tests/data/stata3.dta b/pandas/io/tests/data/stata3.dta deleted file mode 100644 index 265fbcc3a8187d355a598101472f23564415b4ab..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13255 zcmeHtX;c(fyLI&dg@}lX11ciU2q;*hqC!`9ol_t|1I9QK#HoxkI3sTq6;K3&BSM83 zR76BYB1Az6gA)=dhajTC#3`aiMB{)H(QkL<{nom7tv9*r{=V^#t=+r#Gwf$Lr+T4Q z!`L$CnLq!8cAV4h-!K2;UzdUo8O?7*-~RLOT(Ugi!zzXm-VEf;{~u->qQ1}9OfoV9 z26r7fC9>-P4^NM-1Gu3By_u$W@Bho6$Qe@>nn8d0Ct}Hzg^OqW&1V+Qm@;?f^zgs@ z0>=D-X$$5r`pZw4KQL?l(!YFP{f3zfmM;FwPnf@9dibore4m;2cW;_u^F03=tiH2& z>a4j7r!1cFmmga)f5yDOi2}ofFZkQH8D`deY3^Tsz4?tl1NGNo_**c;^c>R9W9I+x zng90o{~qA|zh(Fb@ZTc-pa0SG|6-K>b144#-+v|J%#`OP5W5w<%8~2I(m(~nD>>i zF7v4%H9jF6_Yja4LO~bGbKD+2>WZh3bh0g<5$C| zLX^tij7+o*SQdk2Kku0}KyV4^&pX8?0HP;t`cY|6E!li%yO3r1Ops^&PENf9gs~Aq zk~V-PkH0*X6C!*z_{Kh3&1bQi{_i{~6ZA8gePgc>>2P0Q!e5a3;CiE++8p+W@sgCu zpVtf&tZUlv^}#8IELQ8&%abadZ;>T!whJ6rCvc};5=p8za-5Ce6zi%&$`1^LVJ&?* zt~*$C6T>~J?0u6k9@~U9ho0o)P8jMn%jo@S5UNy{hotL0gy75u=ImKD5zSgBb zmEvxa^S4(E8wWQCT-8f5x}d=*JG6i=t>JzJA30)>;D1{-z#qeYR^vU;gGx(pk{T{n z2zvZjVC-I#+bhfvTfsi3Js{Wl7k>-_OJ#A6pC`Yc)!IjMRIa!~zHmwsnp~dZt#`yF z}rHVVq{56J!#u-G9l_9{!y zxFPteKto8CzHzl8)Nv{BD$_*23t5erm*uZ+Z(gd+hj{Jx_hJS?%})17$8f zChueR3f`vysnXM=lfOeSt%cY}o;<_k^2g(cLT?tna;giT&Fad6j8rLlLb^v~3A?|u 
zB+MoY>SO&JviuWb7EJEwVaO2+jk&@wy|%(gx$e*L?Yo80w-!X+Z%LhEpBt4xdjxr~ z$}&bb97YEiz{DS5HPeF$mCn8(0XGf^xyQAHImc4#8Z%2n+$d`oOw zV;3ggf%~IqM7uSEWY-T;kYNt2*gmTSwH2H3kz$NI2ysHgu?DHCs&O(%Zs9G<{cMVX2WVpSe2OzrD z_dkVTEa=*>GGT2rQkl?}o_hHjf^mZTQ^1nW$M537ve&>cc7W)ER*j`xl?9D@aZ*Uh zWQZiTrx8AH;Y(K-7Oq0tdmB-)I^O^GPliLRMt^z|RVuA$(w3{j8Rk8jQU_%J9oj`V z6mk@=RHpH(zcZrt_ZZBAImBv1NKg7~|mcP)W0$^7P# zCe%(hvtMA*C6Q^A$+e-?Lw*qwLJ+ZsT`0%>0hV6>W=Z6SeK3J#8tTOa7QJ`r=TynC zqn!*lg<{r<$j;qh*axtr|Gafg)O8M+CXfaG2HJd%c*-Q##O518xYhwT?uo zcLj@uNp=_j!9=JK@u&#+Of=YjzK2=e;Kxg;?BYPZW9|s1u&zYj><;%WO>(Li1iw** z?EKgd8k@oUSdRmXKK9~r%5~~U%Vysf-rB;jtC%;E6(VMCcWG*nuV3&3%MJ{SA-=h+ z#_Q=9Xt0jdVR60Sd954a?qU+8HUrC02xhP9zQN01oi#}(s|~rxi_>bV4D3qB=o$t8 zcxS>~8wf~q6BF%=kL5ziVSG`(KU7`7yM!ba=d!x)+BK9J(w$aVHV7fTyAoxf7c|zw z#4sjw%S;vG6EFgy*o=tv0*ik97C>Hir^TzD3R(S}Fu?}FeOnWkCqiQ<;L8L)>EsBs zj!kHN2Mph`8YMl3N*8+4`>qTbJE{wjj$#sUtpM?d;7_WM!7c%aUNxGMeaW}1Hb)mr zl^eZiL*RSC=Z8*2`f>;$Z2%boDPB||L5BhmvBS{X8nEarYhx*Qpf|N{uO-9o12Sqj zAZ;O-0JwY$_Dg)w$d3^4gXsL10MWM}8cSu33oYy0D9DBFiIn4mh_#1cMqwnqP$A>5 ze*}$H@;QFH3`elT=)uw)x?0*GR2SHzDfyyXc7$Q0;PN#<;`l5nP&oDSoH}2Qu-YIK zAXRRK<1qjIntwF-PLY<$v! zNXY~cNBA-Z_MZkzCV#wQB>KP}zMxAi4AVPUz?bW;^fh-+NblQ%$iEAKI3fGv@UgGJ zQpg7`8-@P7%dD~DJXYhJyB;H{Kdp(Rm#U8ZNDhpCyXaPt7Al+bC7~D@$A%pjS4DswlUQA%oB_wU2Qm+9tgJ;Qj z7Yo9j_y~q|N06tXMI=LGrTpmeL2!RKUf%+X-tpN6OffGSVWA_9RvL(ZBsA6oRVg9+ z%T!3&Z$T*Jq2_Wk_`|&bbJEtPddi&~4k$VN5jZYLj ze1eGC8)F{@mwv};H}{aKbd}Q0FZ&ShnAZZgXDT2bCZ;qRv$76(N#pY$d;<43n+@Z? 
zV|Dt`sgz3)Y3RgmEoQaK2Qt%iCT%>PtgjvNE=_!$ejYy^v5ItP|UNvOBedXxBUv4bZCr0=;gOeE z81n`Z+r!Te`P96P8?pKTqFsGCmC6qZU1#Y?vf8~A<5~pUhC0ew3z>S{E|gu;n#Sq4+p`bFF3M; zDpyQ2d23s;t{w7H8U~hN6Zd>Qa`6ictK!Etj75by8-1+(N{_Lcke{}r?g{PwRin^< ze4r)QYkrr9(T|X?Hp*#WhUxn60-j#`q(*S-^wT^ zk4Ijr)Sdqr`dH#ulsgefduHm1P3yM;6Fm#QOh$=qf_k%%eZ6sXhY2X;OFSyGq=40o zYWoeEQV>m_+M48_dnGVW=OQm*CV9eExcoH?Yvf(mO@Lok@WqoaV6|KQ(&5W!TAkj3 zIFMHYcNePzGaaeihL2^z{pWn6)kKK@y!l`S7Ts36T^JT)==g^nh|QKpfomQCUuL3S zb|Cw^RFy!oji=wZ9b+d&+k@JPGw;10~0n7P4BKDf_5A zYZA3=){iuf|3grUmLOt_kjiunyIt_*F5lnB+y?h!nk^|VWObpFv#GrLGx|PeFo{s! z3(DL`L~JR7ya$FIN04vu^X`YJ`wY%zhCU2wdTIIHn_1W2E$)O5>>c=q{2;B?W5*eb|!Kib6)aKC~-@`oAnOikr+ zR=Y9dTPnSmsO@JaGH(aEWz=f8zZQM$5Smhr+AU9Q4~3Ckj83s1l^$nx($qW@@=V$g z;!lcfv_x6I2JXk9#B$ZaXYR>ibTW+WXsp@80HQB1`wsOoi#pT?5}N@!Oysp-iN_9; z2aVaGtcv)EpC=>8dLuf&;RLHmX?c`#x;a?j14;Z89pOI2fMp}{Qh*X`r$U^=!!VMJ zsQtdBCvX-Uc9b%=W>cTpfh0+b`C=CXmMzd&5eA<k{tfpvB0lsw}4fGR82-}Rv#yF&M7a%1ls}OX{3VvtH zX%NgTyq-`}gnjI%e7OHPZOrl{{*g9B2{XID7w(^dFHK10L%z0f8p_QA9j2zZh}Auf z$)`;8Y`U&X50bmZn#kYBf^|Q7awWE-o^ZdBFPl9b_0rYcpM8s1eNOE$%B0Vu_7?_{ z(N@ig61^Uf9Hdf#VRPY2J%97bbYy?1xsUPR!75af17XOGVj zv0iZh2SYKdYaD%ya-X0Pz35AF#*<j_mx7`9r1qxY%`bdKIPyUDoELlV@Z|O;t>0bl#qe-bZ zL%ORPt9>&UJ#8^>9!sUzEwe9C#Uq|>ELnhA=}MSiccNaZkjm!pMT>|%NIF=qqw}{X zjeUN_tgPc3Ixj%Pd~gzv1&eNsPSPdeQZw3sz4)?o~*hiODh+T(p_>x%4as7=a;mf!i zRF2(3y;ja8Vb#M3(>W7{)tb1iy#Xmx4a>?5heU@VmHuGS$;*DBa@+0H#eN3y|J_8S zx0ztMVUi2o0okvz^b{jdq1_=E4?y%cH{7J$yX~~of*Iu5ZwA8H?}IPDppXX-J(kAcA6bAmt1UVBwVK~u-rDuZ@h_9Y@Rw^*YXSD%QhIc z91yL&+g&Q1+<|Mgxn%SA0%80!AxQ9EXR2r+B zN0vMn(E0bn{W_Ck;{(Ias^ist;Udh+ICPlzfau-r9#R=wY{AM^l8jW;_Aj_5-iE`->eEH3UI~3e27a+(-`TB^( zm}qMun9GLK*kSx0s^%9SDbUuHE)l|KBPNku{C|9zE#FrTeHwS}|Ys5lg z!d0|2I!q?NwO}d4U(VzD1`w_DxtElCzKc2xS&chOf|Gb2e0hWU0`W*%6|!;MG7P>= zNag2%=z5-bMV0hDG)r1VX1}1g?#PEPZ!!1?BK8`o+|T!JwOl>EVYdWCzm>kG^1KYX zv;1E~-w2kW*fy9C*v65g$`{DXK|VigIRtYB``Bv3Syq$q^IIw%-%F#&Dw6!jk0>c< z>vasCDvTiJRn*Kfc2p$tk^o;q__M6G;>J5FiJ8=T+$wUfk3ZoqV%vz;3Q}AEd}&l6 
z=bcxekA1C*|16$a)V`z4gG@SmOeCoegE3Fd+lGa}Jr5)yNx!MtPqB@H`!+c894|S` z>K|slp-kFddNXMjPOkn${uU!D&k`eO6qH;I@t@+8??%D><@Ht1XchWcAYLyroMSZ&_3tTj zWj}rPY$lnQ0Z98|M65X?HV4^Xj#Sp;nZ+vX%mG;7y#Ud=UuHzPI-7tvGw*bou)V{9@ z@oI>Mp%WmO+r{TtyRIcPu)3s>gaGIc9oPsQE!4geyg{~U*aO`Szis6EIcm=EZ zy-X*{@rUV{n`=pSA*ynEDIhJuvIetqsM=uP=dVWV8-S){;akCKzfaYR%8+mA><2N# z(Rw(Mrk;W?b^^0$9r9vSBbL#14O019each8>J|iAh;q+7y6932S!X+x$jT|SzE*gs zk^uL4HI+YaTZ2M2V(@kUzJk@aX~~N6_xUvI#{{y@au`u^&mu3aagN!Dr*HyTs`%Y6 z*T9#T22`fu2Uc@vvX!Wm9;Mc`o5_OF1BvqM8Dzf=E+oE0?T4z6GeK(+j?NIw8bGvz ze5^&La{&#RB$Kp24$D;qQfV(Rmy%E~ad7`EfAQE_Y_mb;vlt+{KF-ZWd3zyk+_r-> z4jDqYk{?FRq|3 zLzW%!5Hb<9zgP8TOq)0~SU|b^w_o|FK{F+W0FS#T8RjB>WP;MeIpZCqf zqW`=&ut<1hv(He;YQ36Ti%Or%2;Vt*Cg(os-s5+KXnDkQ8r4nw?++F>eL zU5sEaDr3&jyJg>!dHvl9cfSVN?}EITWw}djeW3yC01*IZV3@x8c3V+-dX~D2-;$k& z-H6ogI#TH*FdpkrFW;yTJVV0%Z$yJ_GAO7QtM;P&`#E}i)^{W-q%YxE>@;;wcsd#b zS?)o^GWm4t^$<)to>(pA6;?Y?bP(mT3OejVDUqkR6XxIzuyhq9F&6D29mb^dMT`H0 zm^WC3)#VQ8Br+Zf-97I#$-m@=Z@!7VIK!}5q|*EVB8|6wY`!e&r?x%?=a*@nMP>a( zTC}O0+~0~tG4Ku|)=f|X_%i3fJe`3tNj?`KC*PQ6U$wZBXp2lOU# z#C=4po4~D&Q(4US7Y{Sz(Z}BUo_e_n5N+oXPRM>WeVBZKl%K~2TZ+|z>yDlrhq?a^ z3`^t-Y&T%La70sb21M6unzN`_U8CZfE5s|bFJXpYb*KZR)Ewj_xW9>?8odE5KN-xA zJTI{Nk^Opz%J^DZKCYI0-VDCHXn@9g2(ouP*3m?j#p&J#2~64tUy2GCdv=%XwT74AQ)8T7mes+E_ei;hz^s98d=LX{L2g(1b93C z6031fbVIxNjh4NAM5;!*67B$odR<>+KLPINBVy5fa#bQGayTBQJTY9tZxtT)7p0x` zv>~X8Sh@@#oZ&ruaTAzZ@mP8b;Y%d%@3|3Y*3A0-xN|J}5ge0hWQJbvzB_I#f2dS4?t>zF| z>Yq_%+i&FNOfRB{W{9UCE&UR!!c;Z;?z_zwl6Dvtd%&U_>ES8Le>|t5yYG>W-wYzm z7Cj8ZuQ^ILV^;QqVX3^{G6_Mh^Ud*V#b0LiYcGN2(kp5sT_Y2(3?YoA4ypCT4@0(L zSp0%Vv}j77C7~zJ0i?|EBdduV$B9z*Yq~7^CP^CVgKH>DQ3>Rb%5BL0E!6u_zG;6F z1TzEIgO5sn#LoZGKv8OVLmS55A&#<<$RDvH!wkfg=XR`c*YSw9kWbca!i{1%eyh;Z z_eWM&;OQkY>F?2&;YXxin4PJE1bIR#SW;D%yWefXHcrhChDxefjo<QTp)@x;EnuNwC49xR(XX zU^Lh?2xc>Oup)keb~6myXFvptt5|K-?7%$MEhFyRLCq8M4Y8F}f2@mdYT z^wf2@$fRgR+cz(W*Zfb2Jj(__9x8Cc9%xKbeF_ zQ9Z^InZ@jshM|vTsx0Qb_y=zROFOW9S8|2biZ#BXlBgH`BN^%vGLA5FZAIAuzfIV* 
z5A#KaVR?MczAczyGjZF~5iGiHzP!jCwh)hW*V2p*lZZSSgRl~hH@jwo4xeqA0~#i~SE<(!r^rSV&qS`+jH_ z2hjSap!U=Gq^aAWvDQZGn!M6#R$n>5Ph{>`i|yAor1ldpEwjPA|qv1;$ zpL1**SSo#yC;lqt{>2gKV>V)ZLJOMTbqe8L*~2h@1ouC?giPm9aDIrycDgE;KN zr!eFYCUQBHoW@tw?Z8oVAY{3#u)0&Ma<@qD$ID6mqWGEzL@< z@<{>2%9!JK9V|L&K#-_huoK;{z9!`EGQt%)h{<=yW4;ua_hdBKQa)*S3i1+AYQ9*x zhWq4kBSr2|OR=}t0}^>TipZKyVuITk{3z@UATNE)k2z}Jq?nJQWg$tn#_Oym!w{AfyJ)QhR+S!uvotmHbY)hCY-nQbk_BW;i3 zmujmBv(!4MbkKr@}1jI>Yi%m9UlCi!Ju+|YI)bJ2OZ*Zyb`d`n$;QZ6X0VJNl z7Xy}exE}4uU}Ub~n^>t}WMF8ZU}U^8lZ$EcZ)P<{i_IUIRTw8LuzUsLq)n`jY@5Tl G3pD@&KOZ~* delta 283 zcmX?}emuRth>4MbkKr@}1n2@O5I$kdI1M7g$QTD?7+G?yf$$j^1ojy*Z`^Cd%Maqf5G)1;JZz8@WBTL=EIN}FSiWvn IU|FsY0BI0C9smFU diff --git a/pandas/io/tests/data/stata3_v13.dta b/pandas/io/tests/data/stata3_117.dta similarity index 100% rename from pandas/io/tests/data/stata3_v13.dta rename to pandas/io/tests/data/stata3_117.dta diff --git a/pandas/io/tests/data/stata4.dta b/pandas/io/tests/data/stata4.dta deleted file mode 100644 index c5d7de8b4229543187a83a73fe39a23be00306da..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1713 zcmd5+y>1gh5dIPfe+7!5p#W)x6p02azDOvf$z4N1!7Es27~3nY?;_vXh6=Z-sHovl z+#?#EAPr9eM9g=$C+^}ObOjPCeX~2e^S3j*lkM(c4|mWU-@bm?;m)JxHt@0BueO(7 zuHtX{#itK1f#8{WA$=1Zc8^T=_#`;OkIP>_F9%cS-t}DE7bkDvr^*25hCqN)t}bAc z*Xj7oNiH=*bpda@H@EqyH?~=3lVMgD7`Qaf%NQ8M_Hl6BIUH^mX`UWDZ>!J^+G+gz zkXVGC>pMt6n-?VaSAjoO+F-#KX2WUS#j5vmCo;ubF}~hr7Ef&4chbSDYhHpg5r)@+ z3lb2Ea1U6L;QM>vb9kd5QIh5(nQSx*L@f=j#AfE|Kung7r98__th`S~r-*J_gli1y zE3fo51#T)?2h~OkgT<%NpJL8Zo8+}*wlknbFn@P4FyZE?!xM%F~|10qlLTUi pqa2B)pt2{bwDuoe37AgrDLCo@*8^J(fF78VwO{#)hyNxezX9{t7M%b9 diff --git a/pandas/io/tests/data/stata4_114.dta b/pandas/io/tests/data/stata4_114.dta index e5d4e8ca02f045b05ad43700453d520745496fdd..c5d7de8b4229543187a83a73fe39a23be00306da 100644 GIT binary patch delta 287 zcmdnUyOFoPn2C{rm4S-^6{(Fcl&W6FvY{2jjCt8T0$S7#NJq6x>pi z6pRcEjTHmMk 
zRzGH-F_X(!j3)CjD^0Fs(Vx75S!8lCv)trP7I`4s2uw2q4Mbm4S-^0yKaW2%j)!oCXnLWQ+qcj4Zjzq7~Rag|e>@ikf95O(C0B_Y}xBvhE diff --git a/pandas/io/tests/data/stata4_v13.dta b/pandas/io/tests/data/stata4_117.dta similarity index 100% rename from pandas/io/tests/data/stata4_v13.dta rename to pandas/io/tests/data/stata4_117.dta diff --git a/pandas/io/tests/data/stata5.csv b/pandas/io/tests/data/stata5.csv index cc597582472e3..218380c99593c 100644 --- a/pandas/io/tests/data/stata5.csv +++ b/pandas/io/tests/data/stata5.csv @@ -2,12 +2,12 @@ byte_,int_,long_,float_,double_,date_td,string_,string_1 0,0,0,0,0,,"a","a" 1,1,1,1,1,,"ab","b" -1,-1,-1,-1,-1,,"abc","c" -100,32740,-2.15e+09,-1.70e+38,-2.0e+307,01jan1970,"abcdefghijklmnop","d" --127,-32767,2.15e+09,1.70e+38,8.0e+307,02jan1970,"abcdefghijklmnopqrstuvwxyz","e" -,0,,,,01jan2014,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" -0,,,,,01jan2114,"1234567890","1" -,,0,,,31dec2014,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" -.a,.a,.a,.a,.a,29feb2012,"!","A" +100,32740,-2147483647,-1.70100000027769e+38,-2.0000000000000e+307,1970-01-01,"abcdefghijklmnop","d" +-127,-32767,2147483620,1.70100000027769e+38,8.0000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e" +,0,,,,2014-01-01,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" +0,,,,,2114-01-01,"1234567890","1" +,,0,,,2014-12-31,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" +.a,.a,.a,.a,.a,2012-02-29,"!","A" .z,.z,.z,.z,.z,,"&","Z" ,,,0,,,"1.23","!" ,,,,0,,"10jan1970","." 
diff --git a/pandas/io/tests/data/stata5.dta b/pandas/io/tests/data/stata5.dta deleted file mode 100644 index 4ee2ca902e757a48a0da219ebc6b70b13e1ff0d7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4924 zcmeHK%TE(Q7@q|bIT%kK)C)rhL=)3=mm-!Bg(3ytDBuH0)@^s&U1+yucX(;ymK!9- zN8-sx;=$L!Kfr^;vnD1UJ$N7yxI)w@iO4!LI}w0xR!uYSd2hXYHk$4kCmUR?&?<|F@ZE0ghg-C4O+uQ*j~aRlQYvLBcw7p zypjzWzKZ`v)30T|`2B`rlF z_NLJI9O|O>)@JaH4R&|&+qS<8!2f`Mj%NTdEDoBZU>mu#ijwQP#J2NGUn}vOU7n5+ z{-n0klMlb6@)USWYh7S6VB=<)T=^zV{XcU%35to^3+&P)x{~W54wz;>4bym44|A17 z=A2&%{1PU%wWJ6Fo1<{Vvn>(sC$Tx`)L_2V7Z?#js_g zkww+&;%%|^qaE?iu4BhfoIG{FTU%ptRz`U~#iMX4duc(CbEGzCycjGl|E9~bNGn1bRYZOf1#W;Th^&^RkkmjFLBgWuHx~Td bAP_iWcu+CLXj7O-KH|0IQ_*fY?2*tf*7&hK diff --git a/pandas/io/tests/data/stata5_v10.dta b/pandas/io/tests/data/stata5_v10.dta deleted file mode 100644 index 76de9a40090a7b43e6e4cdeba156bc8cb3e36bbc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4924 zcmeHKO-vI(6n+c-a6r9yP-DUnQi2I-+ERXMh(dvaii+~1;z76F?GN;ic99>6TcU}O z7{%zRYU07agEtdmqQryIM2(4Fj0X-R0#_R~5hJqB%+5kv+Um)emX~ziyf@!_JM(4U z%run8K>-wkDvdwRb6eg-VGQeS7oJ1i&THJ0ktjw0JW>>epr$WIDOkdo5Ayr|*76O| zRlc6)C>yTaM1CAbOqgnrO|&DE!DvRts@-M-{9OFCq%3{o>_d`x3JgY?CspI|OL#m3 zz#H(pGG=gk13b==UW7ou-HT_DRsi@s1;>OV6u zI5f)7!V z-Py~96XxVqTs%J6Y&2DvE32w&EC$FKE{x(F+YfR3B6ic(;r4`)`u_#F`7pu{Zsc(D zA>JX0p>R102ax3ECFBVq43yj=^6`TnU%wA|MZZgOqo5e_c_c{`kbMZXOT5JE(SM%Q z%^1_&-%sb^ukpK9$=zLGyh@2!pV zSE+8eQ8GXPb%SpD9+~)jiM|hlTh+MR_B+(p@;>NM@sXllnpeXk_MNfI;}lN^;~EWL z`(4eit)uL|iYIoxtjKs$GQ4A4?7&>$v!j&d$t5njagDI^f~{WV!l*Y@5JxU?QM|&1 ec5V;~WTT;n_ZuzM24cmDwzuM3w3`hzqW=vqeAQ3@ diff --git a/pandas/io/tests/data/stata6.dta b/pandas/io/tests/data/stata6.dta deleted file mode 100644 index d8a46ea3b9696ffa4939b0bf1fb697f9b8b5833d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3048 zcmXSBVr1Z8U}b=S<RO)OF{GB7kzFtD^TF=zPw=kLG&|GqFLRhFd2GZ+|xv|Q=SX 
zGe}1j!p|EP8i1ipnx~CSOp5V`3#zQDWie?splTn%yst~a85}hP>Jx!{P&t4T{Qv(S zB**tT6kIMm_|XY2SC%R>j2Z|Dw9zocl6Y_@8!U=pLgXh#cSz#2I}1*n9SoOTthN4ixnX45rvGzVg+CY zs*s$KSd^Gtl3G-(qfnf$P?C{YqL5jn0906#k*biJSdp1qnyZkLnwMUZp-_-ol$%*n zlA5BBRH+bLl30?cr!dG3O=)Coc=G>$JuoclAYpN*o*^N{6__qkkn>1kQE^FWS$Rce R6*QB8(-OAIYv|^YQ~=?gqTK)h diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index eb70cc51f786b..9a8d0c4a9a5a5 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -27,36 +27,42 @@ def setUp(self): # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107) can be downloaded from: # http://stata-press.com/data/glmext.html self.dirpath = tm.get_data_path() - self.dta1 = os.path.join(self.dirpath, 'stata1.dta') - self.dta2 = os.path.join(self.dirpath, 'stata2.dta') + self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') + self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') + self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta') self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta') self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta') - self.dta3 = os.path.join(self.dirpath, 'stata3.dta') + self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta') + self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta') self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta') self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') + self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta') self.csv3 = os.path.join(self.dirpath, 'stata3.csv') - self.dta4 = os.path.join(self.dirpath, 'stata4.dta') + self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') + self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta') + self.dta7 = os.path.join(self.dirpath, 'cancer.dta') self.csv7 = os.path.join(self.dirpath, 'cancer.csv') + self.dta8 = 
os.path.join(self.dirpath, 'tbl19-3.dta') + self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv') + self.dta9 = os.path.join(self.dirpath, 'lbw.dta') self.csv9 = os.path.join(self.dirpath, 'lbw.csv') + self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') - self.dta1_13 = os.path.join(self.dirpath, 'stata1_v13.dta') - self.dta2_13 = os.path.join(self.dirpath, 'stata2_v13.dta') - self.dta3_13 = os.path.join(self.dirpath, 'stata3_v13.dta') - self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') + self.csv14 = os.path.join(self.dirpath, 'stata5.csv') - self.dta14 = os.path.join(self.dirpath, 'stata5.dta') self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta') self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta') + self.csv15 = os.path.join(self.dirpath, 'stata6.csv') self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta') self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta') @@ -69,10 +75,10 @@ def read_csv(self, file): return read_csv(file, parse_dates=True) def test_read_dta1(self): - reader = StataReader(self.dta1) - parsed = reader.data() - reader_13 = StataReader(self.dta1_13) - parsed_13 = reader_13.data() + reader_114 = StataReader(self.dta1_114) + parsed_114 = reader_114.data() + reader_117 = StataReader(self.dta1_117) + parsed_117 = reader_117.data() # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. 
expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], @@ -83,8 +89,8 @@ def test_read_dta1(self): # the casting doesn't fail so need to match stata here expected['float_miss'] = expected['float_miss'].astype(np.float32) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta2(self): if LooseVersion(sys.version) < '2.7': @@ -130,28 +136,27 @@ def test_read_dta2(self): expected['yearly_date'] = expected['yearly_date'].astype('O') with warnings.catch_warnings(record=True) as w: - parsed = self.read_dta(self.dta2) - parsed_13 = self.read_dta(self.dta2_13) - # parsed_113 = self.read_dta(self.dta2_113) - parsed_114 = self.read_dta(self.dta2_114) # Redundant + parsed_114 = self.read_dta(self.dta2_114) parsed_115 = self.read_dta(self.dta2_115) + parsed_117 = self.read_dta(self.dta2_117) + # 113 is buggy due ot limits date format support in Stata + # parsed_113 = self.read_dta(self.dta2_113) + np.testing.assert_equal( len(w), 1) # should get a warning for that format. 
# buggy test because of the NaT comparison on certain platforms - tm.assert_frame_equal(parsed, expected) # Format 113 test fails since it does not support tc and tC formats - #tm.assert_frame_equal(parsed_113, expected) + # tm.assert_frame_equal(parsed_113, expected) tm.assert_frame_equal(parsed_114, expected) tm.assert_frame_equal(parsed_115, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta3(self): - parsed = self.read_dta(self.dta3) parsed_113 = self.read_dta(self.dta3_113) parsed_114 = self.read_dta(self.dta3_114) parsed_115 = self.read_dta(self.dta3_115) - parsed_13 = self.read_dta(self.dta3_13) + parsed_117 = self.read_dta(self.dta3_117) # match stata here expected = self.read_csv(self.csv3) @@ -159,18 +164,17 @@ def test_read_dta3(self): expected['year'] = expected['year'].astype(np.int16) expected['quarter'] = expected['quarter'].astype(np.int8) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed, parsed_113) - tm.assert_frame_equal(parsed, parsed_114) - tm.assert_frame_equal(parsed, parsed_115) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta4(self): - parsed = self.read_dta(self.dta4) parsed_113 = self.read_dta(self.dta4_113) parsed_114 = self.read_dta(self.dta4_114) parsed_115 = self.read_dta(self.dta4_115) - parsed_13 = self.read_dta(self.dta4_13) + parsed_117 = self.read_dta(self.dta4_117) + expected = DataFrame.from_records( [ ["one", "ten", "one", "one", "one"], @@ -187,11 +191,10 @@ def test_read_dta4(self): columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', 'labeled_with_missings', 'float_labelled']) - tm.assert_frame_equal(parsed, expected) tm.assert_frame_equal(parsed_113, expected) tm.assert_frame_equal(parsed_114, expected) 
tm.assert_frame_equal(parsed_115, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_write_dta5(self): # skip_if_not_little_endian() @@ -333,9 +336,14 @@ def test_read_write_dta13(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - def test_read_write_reread_dat14(self): - parsed = self.read_dta(self.dta14) - parsed.index.name = 'index' + def test_read_write_reread_dta14(self): + expected = self.read_csv(self.csv14) + cols = ['byte_','int_', 'long_','float_','double_'] + for col in cols: + expected[col] = expected[col].convert_objects(convert_numeric=True) + expected['float_'] = expected['float_'].astype(np.float32) + expected['date_td'] = pd.to_datetime(expected['date_td'],coerce=True) + parsed_113 = self.read_dta(self.dta14_113) parsed_113.index.name = 'index' parsed_114 = self.read_dta(self.dta14_114) @@ -343,14 +351,13 @@ def test_read_write_reread_dat14(self): parsed_115 = self.read_dta(self.dta14_115) parsed_115.index.name = 'index' - tm.assert_frame_equal(parsed_113, parsed) - tm.assert_frame_equal(parsed_114, parsed) - tm.assert_frame_equal(parsed_115, parsed) + tm.assert_frame_equal(parsed_114, parsed_113) + tm.assert_frame_equal(parsed_114, parsed_115) with tm.ensure_clean() as path: - parsed.to_stata(path, {'date_td': 'tc'}, write_index=False) + parsed_114.to_stata(path, {'date_td': 'td'}, write_index=False) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed) + tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114) def test_read_write_reread_dta15(self): expected = self.read_csv(self.csv15) From 27b52782b5ec5c5c0f7317e04de73985252e49dc Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Sun, 2 Mar 2014 11:37:28 +0000 Subject: [PATCH 64/64] Added changes and enhancements to documentation Final PEP8 fixes --- doc/source/release.rst | 4 ++++ doc/source/v0.14.0.txt | 3 +++ 
pandas/io/tests/test_stata.py | 6 ++++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 08bfcbe42ad5b..dd72a1445b816 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -194,6 +194,10 @@ Bug Fixes - Bug in ``read_html`` tests where redirected invalid URLs would make one test fail (:issue:`6445`). - Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`) +- Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`) +- Bug in ``DataFrame.to_stata`` that led to data loss in certain cases (:issue:`6335`) +- Bug in ``DataFrame.to_stata`` which exported using the wrong data types and missing values (:issue:`6335`) + pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 4432e9e891e7d..462351860c15b 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -248,6 +248,9 @@ Enhancements using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) - Added a ``to_julian_date`` function to ``TimeStamp`` and ``DatetimeIndex`` to convert to the Julian Date used primarily in astronomy. (:issue:`4041`) +- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types + and will upcast when needed. 
When it isn't possible to losslessly upcast, a warning + is raised (:issue:`6327`) Performance ~~~~~~~~~~~ diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 02f86c8203df3..f3cc43e6e3fb0 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -40,6 +40,7 @@ def setUp(self): self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta') self.csv3 = os.path.join(self.dirpath, 'stata3.csv') + self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') @@ -56,6 +57,7 @@ def setUp(self): self.csv9 = os.path.join(self.dirpath, 'lbw.csv') self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') + self.csv14 = os.path.join(self.dirpath, 'stata5.csv') self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta') @@ -336,11 +338,11 @@ def test_read_write_dta13(self): def test_read_write_reread_dta14(self): expected = self.read_csv(self.csv14) - cols = ['byte_','int_', 'long_','float_','double_'] + cols = ['byte_', 'int_', 'long_', 'float_', 'double_'] for col in cols: expected[col] = expected[col].convert_objects(convert_numeric=True) expected['float_'] = expected['float_'].astype(np.float32) - expected['date_td'] = pd.to_datetime(expected['date_td'],coerce=True) + expected['date_td'] = pd.to_datetime(expected['date_td'], coerce=True) parsed_113 = self.read_dta(self.dta14_113) parsed_113.index.name = 'index'