From 3e8db64d0cab59eee16a4534f001b03bc5b2454d Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 15 Jan 2014 17:29:21 -0500
Subject: [PATCH] API: add reads_json, reads_msgpack, reads_csv, reads_pickle as convenience functions which accept a string or string-like data input, mirroring their file-reading equivalents (GH5655, GH5874)

---
 pandas/io/api.py                          |   8 +-
 pandas/io/common.py                       |  33 ++++++-
 pandas/io/json.py                         |  30 +++----
 pandas/io/packers.py                      |  29 ++-----
 pandas/io/parsers.py                      |   3 +-
 pandas/io/pickle.py                       |   4 +-
 pandas/io/tests/test_json/test_pandas.py  | 104 +++++++++++------------
 pandas/io/tests/test_packers.py           |  14 +--
 8 files changed, 119 insertions(+), 106 deletions(-)

diff --git a/pandas/io/api.py b/pandas/io/api.py
index cf3615cd822cd..9235497d0d608 100644
--- a/pandas/io/api.py
+++ b/pandas/io/api.py
@@ -2,14 +2,14 @@
 Data IO api
 """

-from pandas.io.parsers import read_csv, read_table, read_fwf
+from pandas.io.parsers import read_csv, reads_csv, read_table, read_fwf
 from pandas.io.clipboard import read_clipboard
 from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
 from pandas.io.pytables import HDFStore, Term, get_store, read_hdf
-from pandas.io.json import read_json
+from pandas.io.json import read_json, reads_json
 from pandas.io.html import read_html
 from pandas.io.sql import read_sql
 from pandas.io.stata import read_stata
-from pandas.io.pickle import read_pickle, to_pickle
-from pandas.io.packers import read_msgpack, to_msgpack
+from pandas.io.pickle import read_pickle, reads_pickle, to_pickle
+from pandas.io.packers import read_msgpack, reads_msgpack, to_msgpack
 from pandas.io.gbq import read_gbq
diff --git a/pandas/io/common.py b/pandas/io/common.py
index d6b2827f94d36..a4bcaa2016b28 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -6,7 +6,7 @@
 from pandas.compat import StringIO
 from pandas import compat
-
+from functools import wraps

 if compat.PY3:
     from urllib.request import urlopen, pathname2url
@@ -45,6 +45,37 @@ class PerformanceWarning(Warning):
 class DtypeWarning(Warning):
     pass

+def _create_string_file_reader(func):
+    """
+    create and return a new function that takes string input and
+    passes it to a file-like reader function,
+    e.g. read_json
+
+    Parameters
+    ----------
+    func : the function with a file-like interface
+
+    Returns
+    -------
+    new function that transforms string input to file-like
+    """
+
+    @wraps(func)
+    def f(path_or_buf, *args, **kwargs):
+        if not hasattr(path_or_buf, 'read'):
+            if isinstance(path_or_buf, compat.string_types):
+                path_or_buf = StringIO(path_or_buf)
+            elif isinstance(path_or_buf, compat.binary_type):
+                path_or_buf = compat.BytesIO(path_or_buf)
+            try:
+                return func(path_or_buf, *args, **kwargs)
+            finally:
+                if not ('iterator' in kwargs or 'chunksize' in kwargs):
+                    path_or_buf.close()
+
+        return func(path_or_buf, *args, **kwargs)
+
+    return f

 def _is_url(url):
     """Check to see if a URL has a valid protocol.
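
The helper added above is the core of the patch: it wraps any file-oriented reader so that raw str/bytes input is first coerced into an in-memory StringIO/BytesIO buffer. A minimal standalone sketch of the same wrapper pattern, assuming Python 3 and a hypothetical read_upper reader standing in for read_json (the buffer-closing logic of the patch is omitted here for brevity):

    from functools import wraps
    from io import BytesIO, StringIO

    def string_file_reader(func):
        # Wrap a file-like reader so it also accepts str/bytes data.
        @wraps(func)
        def f(path_or_buf, *args, **kwargs):
            if not hasattr(path_or_buf, 'read'):
                # coerce raw string data into an in-memory buffer
                if isinstance(path_or_buf, str):
                    path_or_buf = StringIO(path_or_buf)
                elif isinstance(path_or_buf, bytes):
                    path_or_buf = BytesIO(path_or_buf)
            return func(path_or_buf, *args, **kwargs)
        return f

    def read_upper(buf):
        # stand-in for read_json/read_msgpack: consumes a file-like object
        return buf.read().upper()

    reads_upper = string_file_reader(read_upper)
    print(reads_upper('abc'))   # 'ABC' -- string data, no file needed

Note the design choice in the patched version: the temporary buffer is closed in a try/finally, except when iterator or chunksize is requested, so streaming readers keep a live handle.
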
diff --git a/pandas/io/json.py b/pandas/io/json.py
index 698f7777a1100..c41f1759b4d7f 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json.py
@@ -10,7 +10,7 @@
 from pandas.compat import long, u
 from pandas import compat, isnull
 from pandas import Series, DataFrame, to_datetime
-from pandas.io.common import get_filepath_or_buffer
+from pandas.io.common import get_filepath_or_buffer, _create_string_file_reader
 import pandas.core.common as com

 loads = _json.loads
@@ -109,7 +109,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     Parameters
     ----------
-    filepath_or_buffer : a valid JSON string or file-like
+    path_or_buf : a valid file path or file-like
         The string could be a URL. Valid URL schemes include http, ftp, s3, and
         file. For file URLs, a host is expected. For instance, a local file
         could be ``file://localhost/path/to/table.json``
@@ -171,25 +171,14 @@
     result : Series or DataFrame
     """

-    filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf)
-    if isinstance(filepath_or_buffer, compat.string_types):
-        try:
-            exists = os.path.exists(filepath_or_buffer)
-
-        # if the filepath is too long will raise here
-        # 5874
-        except (TypeError,ValueError):
-            exists = False
-
-        if exists:
-            with open(filepath_or_buffer, 'r') as fh:
-                json = fh.read()
-        else:
-            json = filepath_or_buffer
-    elif hasattr(filepath_or_buffer, 'read'):
-        json = filepath_or_buffer.read()
+    path_or_buffer, _ = get_filepath_or_buffer(path_or_buf)
+    if isinstance(path_or_buffer, compat.string_types):
+        with open(path_or_buffer, 'r') as fh:
+            json = fh.read()
+    elif hasattr(path_or_buffer, 'read'):
+        json = path_or_buffer.read()
     else:
-        json = filepath_or_buffer
+        raise ValueError("path_or_buffer must be a file or file-like buffer")

     obj = None
     if typ == 'frame':
@@ -206,6 +195,7 @@
     return obj

+reads_json = _create_string_file_reader(read_json)

 class Parser(object):
diff --git a/pandas/io/packers.py b/pandas/io/packers.py
index 105bea92124fd..8475972f493e6 100644
--- a/pandas/io/packers.py
+++ b/pandas/io/packers.py
@@ -56,7 +56,7 @@
 from pandas.sparse.array import BlockIndex, IntIndex
 from pandas.core.generic import NDFrame
 from pandas.core.common import needs_i8_conversion
-from pandas.io.common import get_filepath_or_buffer
+from pandas.io.common import get_filepath_or_buffer, _create_string_file_reader
 from pandas.core.internals import BlockManager, make_block
 import pandas.core.internals as internals

@@ -124,7 +124,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
     Parameters
     ----------
-    path_or_buf : string File path, BytesIO like or string
+    path_or_buf : string file path or buffer
     iterator : boolean, if True, return an iterator to the unpacker
                (default is False)

@@ -146,26 +146,15 @@ def read(fh):

     # see if we have an actual file
     if isinstance(path_or_buf, compat.string_types):
-        try:
-            exists = os.path.exists(path_or_buf)
-        except (TypeError,ValueError):
-            exists = False
-
-        if exists:
-            with open(path_or_buf, 'rb') as fh:
-                return read(fh)
-
-    # treat as a string-like
-    if not hasattr(path_or_buf, 'read'):
-
-        try:
-            fh = compat.BytesIO(path_or_buf)
+        with open(path_or_buf, 'rb') as fh:
             return read(fh)
-        finally:
-            fh.close()
-
-    # a buffer like
-    return read(path_or_buf)
+    elif hasattr(path_or_buf, 'read'):
+        return read(path_or_buf)
+    else:
+        raise ValueError("path_or_buffer must be a file or file-like buffer")
+
+reads_msgpack = _create_string_file_reader(read_msgpack)

 dtype_dict = {21: np.dtype('M8[ns]'),
               u('datetime64[ns]'): np.dtype('M8[ns]'),
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 813b7e59e107a..8dbf2b39063d6 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -15,7 +15,7 @@
 import pandas.core.common as com
 from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
-from pandas.io.common import get_filepath_or_buffer
+from pandas.io.common import get_filepath_or_buffer, _create_string_file_reader
 from pandas.util.decorators import Appender

@@ -417,6 +417,7 @@ def parser_f(filepath_or_buffer,

 read_csv = _make_parser_function('read_csv', sep=',')
 read_csv = Appender(_read_csv_doc)(read_csv)
+reads_csv = _create_string_file_reader(read_csv)

 read_table = _make_parser_function('read_table', sep='\t')
 read_table = Appender(_read_table_doc)(read_table)
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 915c1e9ae1574..de191f1df5475 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -1,5 +1,5 @@
 from pandas.compat import cPickle as pkl, pickle_compat as pc, PY3
-
+from pandas.io.common import _create_string_file_reader

 def to_pickle(obj, path):
     """
@@ -51,3 +51,5 @@ def try_read(path, encoding=None):
         if PY3:
             return try_read(path, encoding='latin1')
         raise
+
+reads_pickle = _create_string_file_reader(read_pickle)
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py
index 084bc63188e2b..5b710771b88e5 100644
--- a/pandas/io/tests/test_json/test_pandas.py
+++ b/pandas/io/tests/test_json/test_pandas.py
@@ -7,7 +7,7 @@
 from pandas import Series, DataFrame, DatetimeIndex, Timestamp
 import pandas as pd

-read_json = pd.read_json
+from pandas import reads_json

 from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
                                  assert_series_equal, network,
@@ -72,13 +72,13 @@ def test_frame_double_encoded_labels(self):
                        index=['index " 1', 'index / 2'],
                        columns=['a \\ b', 'y / z'])

-        assert_frame_equal(df, read_json(df.to_json(orient='split'),
+        assert_frame_equal(df, reads_json(df.to_json(orient='split'),
                                          orient='split'))
-        assert_frame_equal(df, read_json(df.to_json(orient='columns'),
+        assert_frame_equal(df, reads_json(df.to_json(orient='columns'),
                                          orient='columns'))
-        assert_frame_equal(df, read_json(df.to_json(orient='index'),
+        assert_frame_equal(df, reads_json(df.to_json(orient='index'),
                                          orient='index'))
-        df_unser = read_json(df.to_json(orient='records'), orient='records')
+        df_unser = reads_json(df.to_json(orient='records'), orient='records')
         assert_index_equal(df.columns, df_unser.columns)
         np.testing.assert_equal(df.values, df_unser.values)

@@ -89,12 +89,12 @@ def test_frame_non_unique_index(self):
         self.assertRaises(ValueError, df.to_json, orient='index')
         self.assertRaises(ValueError, df.to_json, orient='columns')

-        assert_frame_equal(df, read_json(df.to_json(orient='split'),
+        assert_frame_equal(df, reads_json(df.to_json(orient='split'),
                                          orient='split'))
-        unser = read_json(df.to_json(orient='records'), orient='records')
+        unser = reads_json(df.to_json(orient='records'), orient='records')
         self.assertTrue(df.columns.equals(unser.columns))
         np.testing.assert_equal(df.values, unser.values)
-        unser = read_json(df.to_json(orient='values'), orient='values')
+        unser = reads_json(df.to_json(orient='values'), orient='values')
         np.testing.assert_equal(df.values, unser.values)

     def test_frame_non_unique_columns(self):
@@ -105,18 +105,18 @@
         self.assertRaises(ValueError, df.to_json, orient='columns')
         self.assertRaises(ValueError, df.to_json, orient='records')

-        assert_frame_equal(df, read_json(df.to_json(orient='split'),
+        assert_frame_equal(df, reads_json(df.to_json(orient='split'),
                                          orient='split', dtype=False))
-        unser = read_json(df.to_json(orient='values'), orient='values')
+        unser = reads_json(df.to_json(orient='values'), orient='values')
         np.testing.assert_equal(df.values, unser.values)

         # GH4377; duplicate columns not processing correctly
         df = DataFrame([['a','b'],['c','d']], index=[1,2], columns=['x','y'])
-        result = read_json(df.to_json(orient='split'), orient='split')
+        result = reads_json(df.to_json(orient='split'), orient='split')
         assert_frame_equal(result, df)

         def _check(df):
-            result = read_json(df.to_json(orient='split'), orient='split',
+            result = reads_json(df.to_json(orient='split'), orient='split',
                                convert_dates=['x'])
             assert_frame_equal(result, df)

@@ -133,7 +133,7 @@ def _check_orient(df, orient, dtype=None, numpy=False,
         dfjson = df.to_json(orient=orient)

         try:
-            unser = read_json(dfjson, orient=orient, dtype=dtype,
+            unser = reads_json(dfjson, orient=orient, dtype=dtype,
                               numpy=numpy, convert_axes=convert_axes)
         except Exception as detail:
             if raise_ok is not None:
@@ -259,20 +259,20 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
         _check_orient(df.transpose().transpose(), "index", dtype=False)

     def test_frame_from_json_bad_data(self):
-        self.assertRaises(ValueError, read_json, StringIO('{"key":b:a:d}'))
+        self.assertRaises(ValueError, reads_json, StringIO('{"key":b:a:d}'))

         # too few indices
         json = StringIO('{"columns":["A","B"],'
                         '"index":["2","3"],'
                         '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
-        self.assertRaises(ValueError, read_json, json,
+        self.assertRaises(ValueError, reads_json, json,
                           orient="split")

         # too many columns
         json = StringIO('{"columns":["A","B","C"],'
                         '"index":["1","2","3"],'
                         '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
-        self.assertRaises(AssertionError, read_json, json,
+        self.assertRaises(AssertionError, reads_json, json,
                           orient="split")

         # bad key
@@ -280,41 +280,41 @@
                         '"index":["2","3"],'
                         '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
         with tm.assertRaisesRegexp(ValueError, r"unexpected key\(s\): badkey"):
-            read_json(json, orient="split")
+            reads_json(json, orient="split")

     def test_frame_from_json_nones(self):
         df = DataFrame([[1, 2], [4, 5, 6]])
-        unser = read_json(df.to_json())
+        unser = reads_json(df.to_json())
         self.assertTrue(np.isnan(unser[2][0]))

         df = DataFrame([['1', '2'], ['4', '5', '6']])
-        unser = read_json(df.to_json())
+        unser = reads_json(df.to_json())
         self.assertTrue(np.isnan(unser[2][0]))
-        unser = read_json(df.to_json(),dtype=False)
+        unser = reads_json(df.to_json(),dtype=False)
         self.assertTrue(unser[2][0] is None)
-        unser = read_json(df.to_json(),convert_axes=False,dtype=False)
+        unser = reads_json(df.to_json(),convert_axes=False,dtype=False)
         self.assertTrue(unser['2']['0'] is None)

-        unser = read_json(df.to_json(), numpy=False)
+        unser = reads_json(df.to_json(), numpy=False)
         self.assertTrue(np.isnan(unser[2][0]))
-        unser = read_json(df.to_json(), numpy=False, dtype=False)
+        unser = reads_json(df.to_json(), numpy=False, dtype=False)
         self.assertTrue(unser[2][0] is None)
-        unser = read_json(df.to_json(), numpy=False, convert_axes=False, dtype=False)
+        unser = reads_json(df.to_json(), numpy=False, convert_axes=False, dtype=False)
         self.assertTrue(unser['2']['0'] is None)

         # infinities get mapped to nulls which get mapped to NaNs during
         # deserialisation
         df = DataFrame([[1, 2], [4, 5, 6]])
         df[2][0] = np.inf
-        unser = read_json(df.to_json())
+        unser = reads_json(df.to_json())
         self.assertTrue(np.isnan(unser[2][0]))
-        unser = read_json(df.to_json(), dtype=False)
+        unser = reads_json(df.to_json(), dtype=False)
         self.assertTrue(np.isnan(unser[2][0]))

         df[2][0] = np.NINF
-        unser = read_json(df.to_json())
+        unser = reads_json(df.to_json())
         self.assertTrue(np.isnan(unser[2][0]))
-        unser = read_json(df.to_json(),dtype=False)
+        unser = reads_json(df.to_json(),dtype=False)
         self.assertTrue(np.isnan(unser[2][0]))

     def test_frame_to_json_except(self):
@@ -350,9 +350,9 @@ def test_series_non_unique_index(self):

         self.assertRaises(ValueError, s.to_json, orient='index')

-        assert_series_equal(s, read_json(s.to_json(orient='split'),
+        assert_series_equal(s, reads_json(s.to_json(orient='split'),
                                          orient='split', typ='series'))
-        unser = read_json(s.to_json(orient='records'),
+        unser = reads_json(s.to_json(orient='records'),
                           orient='records', typ='series')
         np.testing.assert_equal(s.values, unser.values)

@@ -360,7 +360,7 @@ def test_series_from_json_to_json(self):

         def _check_orient(series, orient, dtype=None, numpy=False):
             series = series.sort_index()
-            unser = read_json(series.to_json(orient=orient),
+            unser = reads_json(series.to_json(orient=orient),
                               typ='series', orient=orient, numpy=numpy,
                               dtype=dtype)
             unser = unser.sort_index()
@@ -410,24 +410,24 @@ def test_series_to_json_except(self):

     def test_series_from_json_precise_float(self):
         s = Series([4.56, 4.56, 4.56])
-        result = read_json(s.to_json(), typ='series', precise_float=True)
+        result = reads_json(s.to_json(), typ='series', precise_float=True)
         assert_series_equal(result, s)

     def test_frame_from_json_precise_float(self):
         df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
-        result = read_json(df.to_json(), precise_float=True)
+        result = reads_json(df.to_json(), precise_float=True)
         assert_frame_equal(result, df)

     def test_typ(self):

         s = Series(lrange(6), index=['a','b','c','d','e','f'], dtype='int64')
-        result = read_json(s.to_json(),typ=None)
+        result = reads_json(s.to_json(),typ=None)
         assert_series_equal(result,s)

     def test_reconstruction_index(self):

         df = DataFrame([[1, 2, 3], [4, 5, 6]])
-        result = read_json(df.to_json())
+        result = reads_json(df.to_json())

         # the index is serialized as strings....correct?
         assert_frame_equal(result, df)

@@ -437,18 +437,18 @@ def test_path(self):
             for df in [self.frame, self.frame2, self.intframe, self.tsframe,
                        self.mixed_frame]:
                 df.to_json(path)
-                read_json(path)
+                pd.read_json(path)

     def test_axis_dates(self):

         # frame
         json = self.tsframe.to_json()
-        result = read_json(json)
+        result = reads_json(json)
         assert_frame_equal(result, self.tsframe)

         # series
         json = self.ts.to_json()
-        result = read_json(json, typ='series')
+        result = reads_json(json, typ='series')
         assert_series_equal(result, self.ts)

     def test_convert_dates(self):
@@ -458,12 +458,12 @@

         # frame
         df = self.tsframe.copy()
         df['date'] = Timestamp('20130101')

         json = df.to_json()
-        result = read_json(json)
+        result = reads_json(json)
         assert_frame_equal(result, df)

         df['foo'] = 1.
         json = df.to_json(date_unit='ns')
-        result = read_json(json, convert_dates=False)
+        result = reads_json(json, convert_dates=False)
         expected = df.copy()
         expected['date'] = expected['date'].values.view('i8')
         expected['foo'] = expected['foo'].astype('int64')
@@ -472,7 +472,7 @@
         # series
         ts = Series(Timestamp('20130101'), index=self.ts.index)
         json = ts.to_json()
-        result = read_json(json, typ='series')
+        result = reads_json(json, typ='series')
         assert_series_equal(result, ts)

     def test_date_format_frame(self):
@@ -486,7 +486,7 @@ def test_w_date(date, date_unit=None):
                 json = df.to_json(date_format='iso', date_unit=date_unit)
             else:
                 json = df.to_json(date_format='iso')
-            result = read_json(json)
+            result = reads_json(json)
             assert_frame_equal(result, df)

         test_w_date('20130101 20:43:42.123')
@@ -507,7 +507,7 @@ def test_w_date(date, date_unit=None):
                 json = ts.to_json(date_format='iso', date_unit=date_unit)
             else:
                 json = ts.to_json(date_format='iso')
-            result = read_json(json, typ='series')
+            result = reads_json(json, typ='series')
             assert_series_equal(result, ts)

         test_w_date('20130101 20:43:42.123')
@@ -531,11 +531,11 @@ def test_date_unit(self):
             json = df.to_json(date_format='epoch', date_unit=unit)

             # force date unit
-            result = read_json(json, date_unit=unit)
+            result = reads_json(json, date_unit=unit)
             assert_frame_equal(result, df)

             # detect date unit
-            result = read_json(json, date_unit=None)
+            result = reads_json(json, date_unit=None)
             assert_frame_equal(result, df)

     def test_weird_nested_json(self):
@@ -558,7 +558,7 @@
             }
         }'''

-        read_json(s)
+        reads_json(s)

     def test_doc_example(self):
         dfj2 = DataFrame(np.random.randn(5, 2), columns=list('AB'))
@@ -568,19 +568,19 @@ def test_doc_example(self):
         dfj2.index = pd.date_range('20130101',periods=5)

         json = dfj2.to_json()
-        result = read_json(json,dtype={'ints' : np.int64, 'bools' : np.bool_})
+        result = reads_json(json,dtype={'ints' : np.int64, 'bools' : np.bool_})
         assert_frame_equal(result,result)

     def test_misc_example(self):

         # parsing unordered input fails
-        result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]',numpy=True)
+        result = reads_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]',numpy=True)
         expected = DataFrame([[1,2],[1,2]],columns=['a','b'])
         with tm.assertRaisesRegexp(AssertionError,
                                    '\[index\] left \[.+\], right \[.+\]'):
             assert_frame_equal(result, expected)

-        result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]')
+        result = reads_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]')
         expected = DataFrame([[1,2],[1,2]],columns=['a','b'])
         assert_frame_equal(result,expected)

@@ -590,13 +590,13 @@ def test_round_trip_exception_(self):
         csv = 'https://raw.github.com/hayd/lahman2012/master/csvs/Teams.csv'
         df = pd.read_csv(csv)
         s = df.to_json()
-        result = pd.read_json(s)
+        result = pd.reads_json(s)
         assert_frame_equal(result.reindex(index=df.index,columns=df.columns),df)

     @network
     def test_url(self):
         url = 'https://api.github.com/repos/pydata/pandas/issues?per_page=5'
-        result = read_json(url, convert_dates=True)
+        result = pd.read_json(url, convert_dates=True)
         for c in ['created_at', 'closed_at', 'updated_at']:
             self.assertEqual(result[c].dtype, 'datetime64[ns]')

@@ -606,7 +606,7 @@ def test_default_handler(self):
         self.assertRaises(OverflowError, frame.to_json)
         expected = DataFrame([str(timedelta(23)), str(timedelta(seconds=5))])
         assert_frame_equal(
-            expected, pd.read_json(frame.to_json(default_handler=str)))
+            expected, pd.reads_json(frame.to_json(default_handler=str)))

         def my_handler_raises(obj):
             raise TypeError("raisin")
diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py
index 8cab9a65995bf..957cde34cf616 100644
--- a/pandas/io/tests/test_packers.py
+++ b/pandas/io/tests/test_packers.py
@@ -16,13 +16,13 @@
 from pandas.tests.test_frame import assert_frame_equal
 from pandas.tests.test_panel import assert_panel_equal

-import pandas
+import pandas as pd
 from pandas.sparse.tests.test_sparse import assert_sp_series_equal, assert_sp_frame_equal
 from pandas import Timestamp, tslib

 nan = np.nan

-from pandas.io.packers import to_msgpack, read_msgpack
+from pandas.io.packers import to_msgpack, read_msgpack, reads_msgpack

 _multiprocess_can_split_ = False

@@ -62,19 +62,19 @@ def test_string_io(self):

         df = DataFrame(np.random.randn(10,2))
         s = df.to_msgpack(None)
-        result = read_msgpack(s)
+        result = reads_msgpack(s)
         tm.assert_frame_equal(result,df)

         s = df.to_msgpack()
-        result = read_msgpack(s)
+        result = reads_msgpack(s)
         tm.assert_frame_equal(result,df)

         s = df.to_msgpack()
-        result = read_msgpack(compat.BytesIO(s))
+        result = reads_msgpack(compat.BytesIO(s))
         tm.assert_frame_equal(result,df)

         s = to_msgpack(None,df)
-        result = read_msgpack(s)
+        result = reads_msgpack(s)
         tm.assert_frame_equal(result, df)

         with ensure_clean(self.path) as p:
@@ -90,7 +90,7 @@ def test_iterator_with_string_io(self):

         dfs = [ DataFrame(np.random.randn(10,2)) for i in range(5) ]
         s = to_msgpack(None,*dfs)
-        for i, result in enumerate(read_msgpack(s,iterator=True)):
+        for i, result in enumerate(reads_msgpack(s,iterator=True)):
             tm.assert_frame_equal(result,dfs[i])

 class TestNumpy(TestPackers):
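
With the patch applied, round-tripping in-memory data no longer requires a temporary file. A brief usage sketch, assuming the patch is installed (values are illustrative; behavior mirrors the updated tests above):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

    # JSON: to_json() returns a string; reads_json parses it directly
    result = pd.reads_json(df.to_json())

    # CSV: parse literal string data in place of a file path
    result = pd.reads_csv('a,b\n1,2\n3,4')

    # msgpack: to_msgpack() with no path returns bytes, which reads_msgpack
    # wraps in a BytesIO buffer before unpacking
    result = pd.reads_msgpack(df.to_msgpack())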