rosnfeld
diff --git a/‎doc/source/release.rst
+4 b/‎doc/source/release.rst
+4
diff --git a/‎doc/source/v0.14.0.txt
+3 b/‎doc/source/v0.14.0.txt
+3
diff --git a/‎pandas/io/stata.py
+122-29 b/‎pandas/io/stata.py
+122-29
diff --git a/‎pandas/io/tests/data/stata1.dta renamed to ‎pandas/io/tests/data/stata1_114.dta b/‎pandas/io/tests/data/stata1.dta renamed to ‎pandas/io/tests/data/stata1_114.dta
diff --git a/‎pandas/io/tests/data/stata1_v13.dta renamed to ‎pandas/io/tests/data/stata1_117.dta b/‎pandas/io/tests/data/stata1_v13.dta renamed to ‎pandas/io/tests/data/stata1_117.dta
diff --git a/‎pandas/io/tests/data/stata2_113.dta
1.46 KB b/‎pandas/io/tests/data/stata2_113.dta
1.46 KB
diff --git a/‎pandas/io/tests/data/stata2.dta renamed to ‎pandas/io/tests/data/stata2_114.dta b/‎pandas/io/tests/data/stata2.dta renamed to ‎pandas/io/tests/data/stata2_114.dta
diff --git a/‎pandas/io/tests/data/stata2_115.dta
1.74 KB b/‎pandas/io/tests/data/stata2_115.dta
1.74 KB
diff --git a/‎pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats
1.74 KB b/‎pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats
1.74 KB
diff --git a/‎pandas/io/tests/data/stata2_v13.dta renamed to ‎pandas/io/tests/data/stata2_117.dta b/‎pandas/io/tests/data/stata2_v13.dta renamed to ‎pandas/io/tests/data/stata2_117.dta
diff --git a/‎pandas/io/tests/data/stata3_113.dta
12.4 KB b/‎pandas/io/tests/data/stata3_113.dta
12.4 KB
diff --git a/‎pandas/io/tests/data/stata3.dta renamed to ‎pandas/io/tests/data/stata3_114.dta b/‎pandas/io/tests/data/stata3.dta renamed to ‎pandas/io/tests/data/stata3_114.dta
diff --git a/‎pandas/io/tests/data/stata3_115.dta
12.9 KB b/‎pandas/io/tests/data/stata3_115.dta
12.9 KB
diff --git a/‎pandas/io/tests/data/stata3_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats
12.9 KB b/‎pandas/io/tests/data/stata3_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats
12.9 KB
diff --git a/‎pandas/io/tests/data/stata3_v13.dta renamed to ‎pandas/io/tests/data/stata3_117.dta b/‎pandas/io/tests/data/stata3_v13.dta renamed to ‎pandas/io/tests/data/stata3_117.dta
diff --git a/‎pandas/io/tests/data/stata4_113.dta
1.49 KB b/‎pandas/io/tests/data/stata4_113.dta
1.49 KB
diff --git a/‎pandas/io/tests/data/stata4.dta renamed to ‎pandas/io/tests/data/stata4_114.dta b/‎pandas/io/tests/data/stata4.dta renamed to ‎pandas/io/tests/data/stata4_114.dta
diff --git a/‎pandas/io/tests/data/stata4_115.dta
1.67 KB b/‎pandas/io/tests/data/stata4_115.dta
1.67 KB
diff --git a/‎pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats
1.67 KB b/‎pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats
1.67 KB
diff --git a/‎pandas/io/tests/data/stata4_v13.dta renamed to ‎pandas/io/tests/data/stata4_117.dta b/‎pandas/io/tests/data/stata4_v13.dta renamed to ‎pandas/io/tests/data/stata4_117.dta
diff --git a/‎pandas/io/tests/data/stata5.csv
+19 b/‎pandas/io/tests/data/stata5.csv
+19
diff --git a/‎pandas/io/tests/data/stata5_113.dta
4.52 KB b/‎pandas/io/tests/data/stata5_113.dta
4.52 KB
diff --git a/‎pandas/io/tests/data/stata5_114.dta
4.81 KB b/‎pandas/io/tests/data/stata5_114.dta
4.81 KB
diff --git a/‎pandas/io/tests/data/stata5_115.dta
4.81 KB b/‎pandas/io/tests/data/stata5_115.dta
4.81 KB
diff --git a/‎pandas/io/tests/data/stata6.csv
+6 b/‎pandas/io/tests/data/stata6.csv
+6
diff --git a/‎pandas/io/tests/data/stata6_113.dta
2.69 KB b/‎pandas/io/tests/data/stata6_113.dta
2.69 KB
diff --git a/‎pandas/io/tests/data/stata6_114.dta
2.98 KB b/‎pandas/io/tests/data/stata6_114.dta
2.98 KB
diff --git a/‎pandas/io/tests/data/stata6_115.dta
2.98 KB b/‎pandas/io/tests/data/stata6_115.dta
2.98 KB
@@ -201,6 +201,10 @@ Bug Fixes
 - ``.names`` attribute of MultiIndexes passed to ``set_index`` are now preserved (:issue:`6459`).
 - Bug in setitem with a duplicate index and an alignable rhs (:issue:`6541`)
 - Bug in setitem with loc on mixed integer Indexes (:issue:`6546`)
+- Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`)
+- Bug in ``DataFrame.to_stata`` that lead to data loss in certain cases, and could exported using the
+  wrong data types and missing values (:issue:`6335`)
+
 
 pandas 0.13.1
 -------------
 
@@ -291,6 +291,9 @@ Enhancements
   using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
 - Added a ``to_julian_date`` function to ``TimeStamp`` and ``DatetimeIndex``
   to convert to the Julian Date used primarily in astronomy. (:issue:`4041`)
+- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types
+  and will upcast when needed.  When it isn't possibly to losslessly upcast, a warning
+  is raised (:issue:`6327`)
 
 Performance
 ~~~~~~~~~~~
 
@@ -23,7 +23,7 @@
 from pandas.compat import long, lrange, lmap, lzip
 from pandas import isnull
 from pandas.io.common import get_filepath_or_buffer
-
+from pandas.tslib import NaT
 
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index=None):
@@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
 
     return reader.data(convert_dates, convert_categoricals, index)
 
-_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"]
+_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
 
 
 def _stata_elapsed_date_to_datetime(date, fmt):
@@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
     # numpy types and numpy datetime isn't mature enough / we can't rely on
     # pandas version > 0.7.1
     #TODO: IIRC relative delta doesn't play well with np.datetime?
+    #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly
     if np.isnan(date):
         return np.datetime64('nat')
 
@@ -109,7 +110,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
         from warnings import warn
         warn("Encountered %tC format. Leaving in Stata Internal Format.")
         return date
-    elif fmt in ["%td", "td"]:
+    elif fmt in ["%td", "td", "%d", "d"]:
         return stata_epoch + datetime.timedelta(int(date))
     elif fmt in ["%tw", "tw"]:  # does not count leap days - 7 days is a week
         year = datetime.datetime(stata_epoch.year + date // 52, 1, 1)
@@ -150,6 +151,11 @@ def _datetime_to_stata_elapsed(date, fmt):
     if not isinstance(date, datetime.datetime):
         raise ValueError("date should be datetime.datetime format")
     stata_epoch = datetime.datetime(1960, 1, 1)
+    # Handle NaTs
+    if date is NaT:
+        # Missing value for dates ('.'), assumed always double
+        # TODO: Should be moved so a const somewhere, and consolidated
+        return struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]
     if fmt in ["%tc", "tc"]:
         delta = date - stata_epoch
         return (delta.days * 86400000 + delta.seconds*1000 +
@@ -175,6 +181,62 @@ def _datetime_to_stata_elapsed(date, fmt):
         raise ValueError("fmt %s not understood" % fmt)
 
 
+class PossiblePrecisionLoss(Warning):
+    pass
+
+
+precision_loss_doc = """
+Column converted from %s to %s, and some data are outside of the lossless
+conversion range. This may result in a loss of precision in the saved data.
+"""
+
+
+def _cast_to_stata_types(data):
+    """Checks the dtypes of the columns of a pandas DataFrame for
+    compatibility with the data types and ranges supported by Stata, and
+    converts if necessary.
+
+    Parameters
+    ----------
+    data : DataFrame
+        The DataFrame to check and convert
+
+    Notes
+    -----
+    Numeric columns must be one of int8, int16, int32, float32 or float64, with
+    some additional value restrictions on the integer data types.  int8 and
+    int16 columns are checked for violations of the value restrictions and
+    upcast if needed.  int64 data is not usable in Stata, and so it is
+    downcast to int32 whenever the value are in the int32 range, and
+    sidecast to float64 when larger than this range.  If the int64 values
+    are outside of the range of those perfectly representable as float64 values,
+    a warning is raised.
+    """
+    ws = ''
+    for col in data:
+        dtype = data[col].dtype
+        if dtype == np.int8:
+            if data[col].max() > 100 or data[col].min() < -127:
+                data[col] = data[col].astype(np.int16)
+        elif dtype == np.int16:
+            if data[col].max() > 32740 or data[col].min() < -32767:
+                data[col] = data[col].astype(np.int32)
+        elif dtype == np.int64:
+            if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:
+                data[col] = data[col].astype(np.int32)
+            else:
+                data[col] = data[col].astype(np.float64)
+                if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53:
+                    ws = precision_loss_doc % ('int64', 'float64')
+
+    if ws:
+        import warnings
+
+        warnings.warn(ws, PossiblePrecisionLoss)
+
+    return data
+
+
 class StataMissingValue(StringMixin):
     """
     An observation's missing value.
@@ -193,14 +255,23 @@ class StataMissingValue(StringMixin):
     -----
     More information: <http://www.stata.com/help.cgi?missing>
     """
-
+    # TODO: Needs test
     def __init__(self, offset, value):
         self._value = value
-        if type(value) is int or type(value) is long:
-            self._str = value - offset is 1 and \
-                '.' or ('.' + chr(value - offset + 96))
+        value_type = type(value)
+        if value_type in int:
+            loc = value - offset
+        elif value_type in (float, np.float32, np.float64):
+            if value <= np.finfo(np.float32).max:  # float32
+                conv_str, byte_loc, scale = '<f', 1, 8
+            else:
+                conv_str, byte_loc, scale = '<d', 5, 1
+            value_bytes = struct.pack(conv_str, value)
+            loc = (struct.unpack('<b', value_bytes[byte_loc])[0] / scale) + 0
         else:
-            self._str = '.'
+            # Should never be hit
+            loc = 0
+        self._str = loc is 0 and '.' or ('.' + chr(loc + 96))
     string = property(lambda self: self._str,
                       doc="The Stata representation of the missing value: "
                           "'.', '.a'..'.z'")
@@ -240,9 +311,9 @@ def __init__(self, encoding):
             dict(
                 lzip(range(1, 245), ['a' + str(i) for i in range(1, 245)]) +
                 [
-                    (251, np.int16),
-                    (252, np.int32),
-                    (253, np.int64),
+                    (251, np.int8),
+                    (252, np.int16),
+                    (253, np.int32),
                     (254, np.float32),
                     (255, np.float64)
                 ]
@@ -253,9 +324,9 @@ def __init__(self, encoding):
                     (32768, np.string_),
                     (65526, np.float64),
                     (65527, np.float32),
-                    (65528, np.int64),
-                    (65529, np.int32),
-                    (65530, np.int16)
+                    (65528, np.int32),
+                    (65529, np.int16),
+                    (65530, np.int8)
                 ]
             )
         self.TYPE_MAP = lrange(251) + list('bhlfd')
@@ -272,13 +343,19 @@ def __init__(self, encoding):
         #NOTE: technically, some of these are wrong. there are more numbers
         # that can be represented. it's the 27 ABOVE and BELOW the max listed
         # numeric data type in [U] 12.2.2 of the 11.2 manual
-        self.MISSING_VALUES = \
+        float32_min = b'\xff\xff\xff\xfe'
+        float32_max = b'\xff\xff\xff\x7e'
+        float64_min = b'\xff\xff\xff\xff\xff\xff\xef\xff'
+        float64_max = b'\xff\xff\xff\xff\xff\xff\xdf\x7f'
+        self.VALID_RANGE = \
             {
                 'b': (-127, 100),
                 'h': (-32767, 32740),
                 'l': (-2147483647, 2147483620),
-                'f': (-1.701e+38, +1.701e+38),
-                'd': (-1.798e+308, +8.988e+307)
+                'f': (np.float32(struct.unpack('<f', float32_min)[0]),
+                      np.float32(struct.unpack('<f', float32_max)[0])),
+                'd': (np.float64(struct.unpack('<d', float64_min)[0]),
+                      np.float64(struct.unpack('<d', float64_max)[0]))
             }
 
         self.OLD_TYPE_MAPPING = \
@@ -287,6 +364,16 @@ def __init__(self, encoding):
                 'f': 254,
                 'b': 251
             }
+        # These missing values are the generic '.' in Stata, and are used
+        # to replace nans
+        self.MISSING_VALUES = \
+            {
+                'b': 101,
+                'h': 32741,
+                'l': 2147483621,
+                'f': np.float32(struct.unpack('<f', b'\x00\x00\x00\x7f')[0]),
+                'd': np.float64(struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
+            }
 
     def _decode_bytes(self, str, errors=None):
         if compat.PY3 or self._encoding is not None:
@@ -556,8 +643,8 @@ def _col_size(self, k=None):
 
     def _unpack(self, fmt, byt):
         d = struct.unpack(self.byteorder + fmt, byt)[0]
-        if fmt[-1] in self.MISSING_VALUES:
-            nmin, nmax = self.MISSING_VALUES[fmt[-1]]
+        if fmt[-1] in self.VALID_RANGE:
+            nmin, nmax = self.VALID_RANGE[fmt[-1]]
             if d < nmin or d > nmax:
                 if self._missing_values:
                     return StataMissingValue(nmax, d)
@@ -855,11 +942,12 @@ def _dtype_to_stata_type(dtype):
     See TYPE_MAP and comments for an explanation. This is also explained in
     the dta spec.
     1 - 244 are strings of this length
-    251 - chr(251) - for int8 and int16, byte
-    252 - chr(252) - for int32, int
-    253 - chr(253) - for int64, long
-    254 - chr(254) - for float32, float
-    255 - chr(255) - double, double
+                         Pandas    Stata
+    251 - chr(251) - for int8      byte
+    252 - chr(252) - for int16     int
+    253 - chr(253) - for int32     long
+    254 - chr(254) - for float32   float
+    255 - chr(255) - for double    double
 
     If there are dates to convert, then dtype will already have the correct
     type inserted.
@@ -878,8 +966,10 @@ def _dtype_to_stata_type(dtype):
     elif dtype == np.int64:
         return chr(253)
     elif dtype == np.int32:
+        return chr(253)
+    elif dtype == np.int16:
         return chr(252)
-    elif dtype == np.int8 or dtype == np.int16:
+    elif dtype == np.int8:
         return chr(251)
     else:  # pragma : no cover
         raise ValueError("Data type %s not currently understood. "
@@ -970,7 +1060,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
         self._file = _open_file_binary_write(
             fname, self._encoding or self._default_encoding
         )
-        self.type_converters = {253: np.long, 252: int}
+        self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
 
     def _write(self, to_write):
         """
@@ -990,11 +1080,14 @@ def __init__(self, data):
                 self.data = data
 
             def __iter__(self):
-                for i, row in data.iterrows():
-                    yield row
+                for row in data.itertuples():
+                    # First element is index, so remove
+                    yield row[1:]
 
         if self._write_index:
             data = data.reset_index()
+        # Check columns for compatbaility with stata
+        data = _cast_to_stata_types(data)
         self.datarows = DataFrameRowIter(data)
         self.nobs, self.nvar = data.shape
         self.data = data
@@ -1181,7 +1274,7 @@ def _write_data_dates(self):
                     self._write(var)
                 else:
                     if isnull(var):  # this only matters for floats
-                        var = MISSING_VALUES[typ]
+                        var = MISSING_VALUES[TYPE_MAP[typ]]
                     self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var))
 
     def _null_terminate(self, s, as_string=False):
 
@@ -0,0 +1,19 @@
+byte_,int_,long_,float_,double_,date_td,string_,string_1
+0,0,0,0,0,,"a","a"
+1,1,1,1,1,,"ab","b"
+-1,-1,-1,-1,-1,,"abc","c"
+100,32740,-2147483647,-1.70100000027769e+38,-2.0000000000000e+307,1970-01-01,"abcdefghijklmnop","d"
+-127,-32767,2147483620,1.70100000027769e+38,8.0000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e"
+,0,,,,2014-01-01,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f"
+0,,,,,2114-01-01,"1234567890","1"
+,,0,,,2014-12-31,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2"
+.a,.a,.a,.a,.a,2012-02-29,"!","A"
+100,32740,-2.15e+09,-1.70e+38,-2.0e+307,01jan1970,"abcdefghijklmnop","d"
+-127,-32767,2.15e+09,1.70e+38,8.0e+307,02jan1970,"abcdefghijklmnopqrstuvwxyz","e"
+,0,,,,01jan2014,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f"
+0,,,,,01jan2114,"1234567890","1"
+,,0,,,31dec2014,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2"
+.a,.a,.a,.a,.a,29feb2012,"!","A"
+.z,.z,.z,.z,.z,,"&","Z"
+,,,0,,,"1.23","!"
+,,,,0,,"10jan1970","."
@@ -0,0 +1,6 @@
+byte_,int_,long_,float_,double_,date_td,string_,string_1
+0,0,0,0,0,1960-01-01,"a","a"
+1,1,1,1,1,3014-12-31,"ab","b"
+-1,-1,-1,-1,-1,2014-12-31,"abc","c"
+100,32740,-2147483647,-1.7010000002777e+38,-2.000000000000e+307,1970-01-01,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","d"
+-127,-32767,2147483620,1.7010000002777e+38,8.000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e"