Commit bfd2e19

Merge commit 'v0.12.0rc1-127-gec8920a' into debian
* commit 'v0.12.0rc1-127-gec8920a':
  DOC: docs for precise_float option in read_json
  BUG: explicity change nan -> NaT when assigning to datelike dtypes
  ENH: expose ujson precise_float argument on decode
  ENH: ujson better handling of very large and very small numbers, throw ValueError for bad double_precision arg pandas-dev#4042
  minor: some trailing spaces and a pylint "pragma" to stop complaining about Series._ix defined elsewhere
  ENH: test_perf.py - use psutil to set affinity (if absent functionality - then affinity module)
  TST: print out byteorder in ci/print_versions.py
  DOC: Fix typo. Update CONTRIBUTING.md with note on attribution in PRs
2 parents b13d540 + ec8920a commit bfd2e19

File tree

16 files changed: +187 -75 lines changed

CONTRIBUTING.md

+9

@@ -78,6 +78,15 @@ your contribution or address the issue you're having.
 - For extra brownie points, use "git rebase -i" to squash and reorder
   commits in your PR so that the history makes the most sense. Use your own
   judgment to decide what history needs to be preserved.
+- Pandas source code should not (with some exceptions, such as 3rd party licensed code),
+  generally speaking, include an "Authors:" list or attribution to individuals in source code.
+  The RELEASE.rst details changes and enhancements to the code over time,
+  a "thanks goes to @JohnSmith." as part of the appropriate entry is a suitable way to acknowledge
+  contributions, the rest is git blame/log.
+  Feel free to ask the commiter who merges your code to include such an entry
+  or include it directly yourself as part of the PR if you'd like to. We're always glad to have
+  new contributors join us from the ever-growing pandas community.
+  You may also be interested in the copyright policy as detailed in the pandas [LICENSE](https://github.com/pydata/pandas/blob/master/LICENSE).
 - On the subject of [PEP8](http://www.python.org/dev/peps/pep-0008/): yes.
 - On the subject of massive PEP8 fix PRs touching everything, please consider the following:
   - They create merge conflicts for people working in their own fork.

README.rst

+2 -2

@@ -99,8 +99,8 @@ Optional dependencies

 - `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is
   okay.)
-  - `BeautifulSoup4`_ and `lxml`_
-  - `BeautifulSoup4`_ and `html5lib`_ and `lxml`_
+ - `BeautifulSoup4`_ and `lxml`_
+ - `BeautifulSoup4`_ and `html5lib`_ and `lxml`_
 - Only `lxml`_, although see :ref:`HTML reading gotchas <html-gotchas>`
   for reasons as to why you should probably **not** take this approach.

ci/print_versions.py

+2 -1

@@ -5,9 +5,10 @@
 print("------------------")
 print("Python: %d.%d.%d.%s.%s" % sys.version_info[:])
 try:
-    import os
+    import os, sys
     (sysname, nodename, release, version, machine) = os.uname()
     print("OS: %s %s %s %s" % (sysname, release, version,machine))
+    print("byteorder: %s" % sys.byteorder)
     print("LC_ALL: %s" % os.environ.get('LC_ALL',"None"))
     print("LANG: %s" % os.environ.get('LANG',"None"))
 except:
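The byteorder line added above is easy to check in isolation. A stand-alone sketch of the same diagnostic, using only the standard library (`os.uname()` is POSIX-only, which is presumably why the original script wraps everything in `try`):

```python
import os
import sys

# Stand-alone version of the ci/print_versions.py diagnostic above.
# os.uname() only exists on POSIX systems, hence the guard.
try:
    sysname, nodename, release, version, machine = os.uname()
    print("OS: %s %s %s %s" % (sysname, release, version, machine))
except AttributeError:
    print("OS: uname() not available on this platform")

# sys.byteorder is always either "little" or "big"
print("byteorder: %s" % sys.byteorder)
```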

doc/source/basics.rst

+1 -1

@@ -1228,7 +1228,7 @@ You can get/set options directly as attributes of the top-level ``options`` attr
 pd.options.display.max_rows

-There is also an API composed of 4 relavent functions, available directly from the ``pandas``
+There is also an API composed of 4 relevant functions, available directly from the ``pandas``
 namespace, and they are:

 - ``get_option`` / ``set_option`` - get/set the value of a single option.

doc/source/io.rst

+2

@@ -1060,6 +1060,8 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series``
 - ``keep_default_dates`` : boolean, default True. If parsing dates, then parse the default datelike columns
 - ``numpy`` : direct decoding to numpy arrays. default is False;
   Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``
+- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function
+  when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality

 The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is
 not parsable.
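The docs above say the fast path is "less precise" than strtod. The reason is that a fast decimal parser typically accumulates the digits as an integer and then scales by a power of ten, which rounds twice, while a strtod-style parse rounds exactly once. This is an illustrative stdlib-only sketch of that trade-off, not ujson's actual code (`fast_atof` is a hypothetical name):

```python
def fast_atof(s):
    """Parse a non-negative decimal the 'fast' way: accumulate digits as an
    integer, then divide by a power of ten. Both the int->float conversion
    and the division round, so long mantissas can end up one ulp off."""
    mantissa, frac_digits, seen_dot = 0, 0, False
    for ch in s:
        if ch == '.':
            seen_dot = True
        else:
            mantissa = mantissa * 10 + int(ch)
            if seen_dot:
                frac_digits += 1
    return mantissa / (10.0 ** frac_digits)

# For short inputs the two paths agree exactly; Python's float() is a
# correctly-rounded strtod-style parse, the behavior precise_float selects.
assert fast_atof("4.56") == float("4.56")
# For ~17-significant-digit inputs the double rounding can differ from
# float() in the last bit, which is the imprecision the option avoids.
```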

doc/source/release.rst

+3

@@ -35,6 +35,9 @@ pandas 0.12
   list of ``DataFrame`` s courtesy of @cpcloud. (:issue:`3477`,
   :issue:`3605`, :issue:`3606`)
 - Support for reading Amazon S3 files. (:issue:`3504`)
+- Added module for reading and writing JSON strings/files: pandas.io.json
+  includes ``to_json`` DataFrame/Series method, and a ``read_json`` top-level reader
+  various issues (:issue:`1226`, :issue:`3804`, :issue:`3876`, :issue:`3867`, :issue:`1305`)
 - Added module for reading and writing Stata files: pandas.io.stata (:issue:`1512`)
   includes ``to_stata`` DataFrame method, and a ``read_stata`` top-level reader
 - Added support for writing in ``to_csv`` and reading in ``read_csv``,

doc/source/v0.12.0.txt

+1

@@ -206,6 +206,7 @@ I/O Enhancements
 - Added module for reading and writing json format files: ``pandas.io.json``
   accessable via ``read_json`` top-level function for reading,
   and ``to_json`` DataFrame method for writing, :ref:`See the docs<io.json>`
+  various issues (:issue:`1226`, :issue:`3804`, :issue:`3876`, :issue:`3867`, :issue:`1305`)

 - ``MultiIndex`` column support for reading and writing csv format files

pandas/core/common.py

+9

@@ -42,6 +42,7 @@ class AmbiguousIndexError(PandasError, KeyError):
 _NS_DTYPE = np.dtype('M8[ns]')
 _TD_DTYPE = np.dtype('m8[ns]')
 _INT64_DTYPE = np.dtype(np.int64)
+_DATELIKE_DTYPES = set([ np.dtype(t) for t in ['M8[ns]','m8[ns]'] ])

 def isnull(obj):
     """Detect missing values (NaN in numeric arrays, None/NaN in object arrays)

@@ -718,6 +719,12 @@ def _infer_dtype_from_scalar(val):
     return dtype, val


+def _maybe_cast_scalar(dtype, value):
+    """ if we a scalar value and are casting to a dtype that needs nan -> NaT conversion """
+    if np.isscalar(value) and dtype in _DATELIKE_DTYPES and isnull(value):
+        return tslib.iNaT
+    return value
+
 def _maybe_promote(dtype, fill_value=np.nan):

     # if we passed an array here, determine the fill value by dtype

@@ -789,6 +796,7 @@ def _maybe_upcast_putmask(result, mask, other, dtype=None, change=None):

     if mask.any():

+        other = _maybe_cast_scalar(result.dtype, other)
         def changeit():

             # try to directly set by expanding our array to full

@@ -851,6 +859,7 @@ def _maybe_upcast_indexer(result, indexer, other, dtype=None):
     return the result and a changed flag
     """

+    other = _maybe_cast_scalar(result.dtype, other)
     original_dtype = result.dtype
     def changeit():
         # our type is wrong here, need to upcast
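The fix above converts a scalar NaN into `iNaT` before it is written into a datelike (datetime64/timedelta64) block, since those blocks store int64 nanoseconds and cannot hold a float NaN. A dependency-free sketch of the same rule — the dtype strings and the iNaT value mirror pandas internals, but `maybe_cast_scalar` here is an illustration, not the real helper:

```python
import math

# iNaT: pandas' "not a time" sentinel, the minimum int64 value.
INAT = -(2 ** 63)

# Mirrors _DATELIKE_DTYPES above, as plain strings instead of np.dtype objects.
DATELIKE_DTYPES = {"datetime64[ns]", "timedelta64[ns]"}

def maybe_cast_scalar(dtype, value):
    """Datelike blocks store int64 nanoseconds, so a float NaN cannot be
    assigned directly; it must become the iNaT sentinel first."""
    is_nan_scalar = isinstance(value, float) and math.isnan(value)
    if dtype in DATELIKE_DTYPES and is_nan_scalar:
        return INAT
    return value
```

Non-datelike dtypes keep their NaN unchanged, which is why the hooks in `_maybe_upcast_putmask`/`_maybe_upcast_indexer` can call the helper unconditionally.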

pandas/core/series.py

+1 -1

@@ -567,7 +567,7 @@ def axes(self):

     @property
     def ix(self):
-        if self._ix is None:
+        if self._ix is None: # defined in indexing.py; pylint: disable=E0203
             self._ix = _SeriesIndexer(self, 'ix')

         return self._ix
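The pragma above silences pylint's E0203 ("access to member before its definition"), which fires because `_ix` is assigned in a different module (indexing.py). The underlying pattern is ordinary lazy caching; in this sketch (`LazyHolder` and `cached` are hypothetical names) the default lives on the class itself, so no pragma is needed:

```python
class LazyHolder(object):
    _cached = None  # class-level default; instances overwrite it on first access

    @property
    def cached(self):
        # Build the expensive object once, then reuse it on every access;
        # in pandas the built object is _SeriesIndexer(self, 'ix').
        if self._cached is None:
            self._cached = object()
        return self._cached
```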

pandas/io/json.py

+48 -27

@@ -16,9 +16,9 @@
 ### interface to/from ###

 def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True):
-
+
     if isinstance(obj, Series):
-        s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision,
+        s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision,
                          ensure_ascii=force_ascii).write()
     elif isinstance(obj, DataFrame):
         s = FrameWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision,

@@ -41,7 +41,7 @@ def __init__(self, obj, orient, date_format, double_precision, ensure_ascii):

         if orient is None:
             orient = self._default_orient
-
+
         self.orient = orient
         self.date_format = date_format
         self.double_precision = double_precision

@@ -64,7 +64,7 @@ def _format_to_date(self, data):
         if self._needs_to_date(data):
             return data.apply(lambda x: x.isoformat())
         return data
-
+
     def copy_if_needed(self):
         """ copy myself if necessary """
         if not self.is_copy:

@@ -119,7 +119,8 @@ def _format_dates(self):
             self.obj[c] = self._format_to_date(self.obj[c])

 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
-              convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False):
+              convert_axes=True, convert_dates=True, keep_default_dates=True,
+              numpy=False, precise_float=False):
     """
     Convert JSON string to pandas object

@@ -154,8 +155,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         default is True
     keep_default_dates : boolean, default True. If parsing dates,
         then parse the default datelike columns
-    numpy: direct decoding to numpy arrays. default is False.Note that the JSON ordering MUST be the same
+    numpy : direct decoding to numpy arrays. default is False.Note that the JSON ordering MUST be the same
         for each term if numpy=True.
+    precise_float : boolean, default False. Set to enable usage of higher precision (strtod) function
+        when decoding string to double values. Default (False) is to use fast but less precise builtin functionality

     Returns
     -------

@@ -186,28 +189,31 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     return obj

 class Parser(object):
-
-    def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False):
+
+    def __init__(self, json, orient, dtype=True, convert_axes=True,
+                 convert_dates=True, keep_default_dates=False, numpy=False,
+                 precise_float=False):
         self.json = json

         if orient is None:
             orient = self._default_orient
-
+
         self.orient = orient
         self.dtype = dtype

         if orient == "split":
             numpy = False

         self.numpy = numpy
+        self.precise_float = precise_float
         self.convert_axes = convert_axes
         self.convert_dates = convert_dates
         self.keep_default_dates = keep_default_dates
         self.obj = None

     def parse(self):

-        # try numpy
+        # try numpy
         numpy = self.numpy
         if numpy:
             self._parse_numpy()

@@ -269,7 +275,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
             pass

         if data.dtype == 'float':
-
+
             # coerce floats to 64
             try:
                 data = data.astype('float64')

@@ -291,7 +297,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):

         # coerce ints to 64
         if data.dtype == 'int':
-
+
             # coerce floats to 64
             try:
                 data = data.astype('int64')

@@ -322,7 +328,7 @@ def _try_convert_to_date(self, data):
         if issubclass(new_data.dtype.type,np.number):
             if not ((new_data == iNaT) | (new_data > 31536000000000000L)).all():
                 return data, False
-
+
         try:
             new_data = to_datetime(new_data)
         except:

@@ -342,29 +348,35 @@ class SeriesParser(Parser):
     _default_orient = 'index'

     def _parse_no_numpy(self):
-
+
         json = self.json
         orient = self.orient
         if orient == "split":
             decoded = dict((str(k), v)
-                           for k, v in loads(json).iteritems())
+                           for k, v in loads(
+                               json,
+                               precise_float=self.precise_float).iteritems())
             self.obj = Series(dtype=None, **decoded)
         else:
-            self.obj = Series(loads(json), dtype=None)
+            self.obj = Series(
+                loads(json, precise_float=self.precise_float), dtype=None)

     def _parse_numpy(self):

         json = self.json
         orient = self.orient
         if orient == "split":
-            decoded = loads(json, dtype=None, numpy=True)
+            decoded = loads(json, dtype=None, numpy=True,
+                            precise_float=self.precise_float)
             decoded = dict((str(k), v) for k, v in decoded.iteritems())
             self.obj = Series(**decoded)
         elif orient == "columns" or orient == "index":
             self.obj = Series(*loads(json, dtype=None, numpy=True,
-                                     labelled=True))
+                                     labelled=True,
+                                     precise_float=self.precise_float))
         else:
-            self.obj = Series(loads(json, dtype=None, numpy=True))
+            self.obj = Series(loads(json, dtype=None, numpy=True,
+                                    precise_float=self.precise_float))

     def _try_convert_types(self):
         if self.obj is None: return

@@ -381,34 +393,43 @@ def _parse_numpy(self):
         orient = self.orient

         if orient == "columns":
-            args = loads(json, dtype=None, numpy=True, labelled=True)
+            args = loads(json, dtype=None, numpy=True, labelled=True,
+                         precise_float=self.precise_float)
             if args:
                 args = (args[0].T, args[2], args[1])
             self.obj = DataFrame(*args)
         elif orient == "split":
-            decoded = loads(json, dtype=None, numpy=True)
+            decoded = loads(json, dtype=None, numpy=True,
+                            precise_float=self.precise_float)
             decoded = dict((str(k), v) for k, v in decoded.iteritems())
             self.obj = DataFrame(**decoded)
         elif orient == "values":
-            self.obj = DataFrame(loads(json, dtype=None, numpy=True))
+            self.obj = DataFrame(loads(json, dtype=None, numpy=True,
+                                       precise_float=self.precise_float))
         else:
-            self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True))
+            self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True,
+                                        precise_float=self.precise_float))

     def _parse_no_numpy(self):

         json = self.json
         orient = self.orient

         if orient == "columns":
-            self.obj = DataFrame(loads(json), dtype=None)
+            self.obj = DataFrame(
+                loads(json, precise_float=self.precise_float), dtype=None)
         elif orient == "split":
             decoded = dict((str(k), v)
-                           for k, v in loads(json).iteritems())
+                           for k, v in loads(
+                               json,
+                               precise_float=self.precise_float).iteritems())
             self.obj = DataFrame(dtype=None, **decoded)
         elif orient == "index":
-            self.obj = DataFrame(loads(json), dtype=None).T
+            self.obj = DataFrame(
+                loads(json, precise_float=self.precise_float), dtype=None).T
         else:
-            self.obj = DataFrame(loads(json), dtype=None)
+            self.obj = DataFrame(
+                loads(json, precise_float=self.precise_float), dtype=None)

     def _try_convert_types(self):
         if self.obj is None: return
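The "split" orient branches above coerce every decoded JSON key to `str` before splatting the dict into the `Series`/`DataFrame` constructor. A minimal stdlib reproduction of that decode step (the payload is a made-up example; in pandas the `loads` is ujson's, here it is the stdlib's):

```python
import json

# What the "split" orient decode does, minus pandas: JSON object keys may
# decode as unicode, and **kwargs requires str keys on Python 2, hence the
# str(k) coercion before splatting into Series(...)/DataFrame(...).
payload = '{"name": "x", "index": [0, 1, 2], "data": [4.56, 4.56, 4.56]}'
decoded = dict((str(k), v) for k, v in json.loads(payload).items())
# In pandas this would then become: Series(dtype=None, **decoded)
```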

pandas/io/tests/test_json/test_pandas.py

+10

@@ -289,6 +289,16 @@ def test_series_to_json_except(self):
         s = Series([1, 2, 3])
         self.assertRaises(ValueError, s.to_json, orient="garbage")

+    def test_series_from_json_precise_float(self):
+        s = Series([4.56, 4.56, 4.56])
+        result = read_json(s.to_json(), typ='series', precise_float=True)
+        assert_series_equal(result, s)
+
+    def test_frame_from_json_precise_float(self):
+        df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
+        result = read_json(df.to_json(), precise_float=True)
+        assert_frame_equal(result, df)
+
     def test_typ(self):

         s = Series(range(6), index=['a','b','c','d','e','f'], dtype='int64')
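The new tests demand an exact float round trip through JSON. The stdlib `json` module already gives that guarantee — the encoder emits the shortest repr that round-trips and the decoder parses with correct rounding — which is the same behavior `precise_float=True` asks of ujson's decoder:

```python
import json

# Exact round trip: dumps() emits the shortest round-tripping repr and
# loads() parses with correct rounding, so equality holds bit-for-bit.
values = [4.56, 4.56, 4.56]
assert json.loads(json.dumps(values)) == values
```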
