diff --git a/pandas/computation/align.py b/pandas/computation/align.py index b61169e1f55e0..b2b7fcc3e1158 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -152,7 +152,9 @@ def _align_core(terms): copy=False) # need to fill if we have a bool dtype/array - if isinstance(ti, (np.ndarray, pd.Series)) and ti.dtype == object and pd.lib.is_bool_array(ti.values): + if (isinstance(ti, (np.ndarray, pd.Series)) + and ti.dtype == object + and pd.lib.is_bool_array(ti.values)): r = f(fill_value=True) else: r = f() diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 0baa596778996..c16205ff34b1f 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -512,18 +512,21 @@ def _possibly_evaluate_binop(self, op, op_class, lhs, rhs, res = op(lhs, rhs) if self.engine != 'pytables': - if (res.op in _cmp_ops_syms and getattr(lhs,'is_datetime',False) or getattr(rhs,'is_datetime',False)): - # all date ops must be done in python bc numexpr doesn't work well - # with NaT + if (res.op in _cmp_ops_syms + and getattr(lhs, 'is_datetime', False) + or getattr(rhs, 'is_datetime', False)): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT return self._possibly_eval(res, self.binary_ops) if res.op in eval_in_python: # "in"/"not in" ops are always evaluated in python return self._possibly_eval(res, eval_in_python) elif self.engine != 'pytables': - if (getattr(lhs,'return_type',None) == object or getattr(rhs,'return_type',None) == object): - # evaluate "==" and "!=" in python if either of our operands has an - # object return type + if (getattr(lhs, 'return_type', None) == object + or getattr(rhs, 'return_type', None) == object): + # evaluate "==" and "!=" in python if either of our operands + # has an object return type return self._possibly_eval(res, eval_in_python + maybe_eval_in_python) return res diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index cfbd9335ef9a0..073526f526abe 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1022,7 +1022,8 @@ def check_performance_warning_for_poor_alignment(self, engine, parser): def test_performance_warning_for_poor_alignment(self): for engine, parser in ENGINES_PARSERS: - yield self.check_performance_warning_for_poor_alignment, engine, parser + yield (self.check_performance_warning_for_poor_alignment, engine, + parser) #------------------------------------ diff --git a/pandas/core/format.py b/pandas/core/format.py index 7135573d48644..45bf07b49eead 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -264,8 +264,8 @@ class DataFrameFormatter(TableFormatter): def __init__(self, frame, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, justify=None, float_format=None, sparsify=None, - index_names=True, line_width=None, max_rows=None, max_cols=None, - show_dimensions=False, **kwds): + index_names=True, line_width=None, max_rows=None, + max_cols=None, show_dimensions=False, **kwds): self.frame = frame self.buf = buf if buf is not None else StringIO() self.show_index_names = index_names @@ -284,7 +284,8 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, self.line_width = line_width self.max_rows = max_rows self.max_cols = max_cols - self.max_rows_displayed = min(max_rows or len(self.frame),len(self.frame)) + self.max_rows_displayed = min(max_rows or len(self.frame), + len(self.frame)) self.show_dimensions = show_dimensions 
if justify is None: @@ -330,7 +331,8 @@ def _to_str_columns(self): *(_strlen(x) for x in cheader)) fmt_values = _make_fixed_width(fmt_values, self.justify, - minimum=max_colwidth, truncated=truncate_v) + minimum=max_colwidth, + truncated=truncate_v) max_len = max(np.max([_strlen(x) for x in fmt_values]), max_colwidth) @@ -349,8 +351,8 @@ def _to_str_columns(self): if self.index: strcols.insert(0, str_index) if truncate_h: - strcols.append(([''] * len(str_columns[-1])) \ - + (['...'] * min(len(self.frame), self.max_rows)) ) + strcols.append(([''] * len(str_columns[-1])) + + (['...'] * min(len(self.frame), self.max_rows))) return strcols @@ -382,8 +384,8 @@ def to_string(self, force_unicode=None): self.buf.writelines(text) if self.show_dimensions: - self.buf.write("\n\n[%d rows x %d columns]" \ - % (len(frame), len(frame.columns)) ) + self.buf.write("\n\n[%d rows x %d columns]" + % (len(frame), len(frame.columns))) def _join_multiline(self, *strcols): lwidth = self.line_width @@ -484,10 +486,11 @@ def write(buf, frame, column_format, strcols): def _format_col(self, i): formatter = self._get_formatter(i) - return format_array((self.frame.iloc[:self.max_rows_displayed,i]).get_values(), - formatter, float_format=self.float_format, - na_rep=self.na_rep, - space=self.col_space) + return format_array( + (self.frame.iloc[:self.max_rows_displayed, i]).get_values(), + formatter, float_format=self.float_format, na_rep=self.na_rep, + space=self.col_space + ) def to_html(self, classes=None): """ @@ -679,8 +682,6 @@ def write_result(self, buf): 'not %s') % type(self.classes)) _classes.extend(self.classes) - - self.write('' % ' '.join(_classes), indent) @@ -698,9 +699,9 @@ def write_result(self, buf): self.write('
', indent) if self.fmt.show_dimensions: - by = chr(215) if compat.PY3 else unichr(215) # × + by = chr(215) if compat.PY3 else unichr(215) # × self.write(u('<p>%d rows %s %d columns</p>
') % - (len(frame), by, len(frame.columns)) ) + (len(frame), by, len(frame.columns))) _put_lines(buf, self.elements) def _write_header(self, indent): @@ -783,8 +784,9 @@ def _column_header(): align=align) if self.fmt.has_index_names: - row = [x if x is not None else '' for x in self.frame.index.names] \ - + [''] * min(len(self.columns), self.max_cols) + row = [ + x if x is not None else '' for x in self.frame.index.names + ] + [''] * min(len(self.columns), self.max_cols) self.write_tr(row, indent, self.indent_delta, header=True) indent -= self.indent_delta @@ -851,7 +853,7 @@ def _write_hierarchical_rows(self, fmt_values, indent): truncate = (len(frame) > self.max_rows) idx_values = frame.index[:nrows].format(sparsify=False, adjoin=False, - names=False) + names=False) idx_values = lzip(*idx_values) if self.fmt.sparsify: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d0a1511ec1cca..93587cd11b597 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -432,8 +432,9 @@ def _repr_fits_horizontal_(self, ignore_width=False): def _info_repr(self): """True if the repr should show the info view.""" info_repr_option = (get_option("display.large_repr") == "info") - return info_repr_option and not \ - (self._repr_fits_horizontal_() and self._repr_fits_vertical_()) + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) def __unicode__(self): """ @@ -486,8 +487,7 @@ def _repr_html_(self): return ('
\n' + self.to_html(max_rows=max_rows, max_cols=max_cols, - show_dimensions=True) \ - + '\n
') + show_dimensions=True) + '\n') else: return None @@ -1283,7 +1283,8 @@ def to_string(self, buf=None, columns=None, col_space=None, colSpace=None, index_names=index_names, header=header, index=index, line_width=line_width, - max_rows=max_rows, max_cols=max_cols, + max_rows=max_rows, + max_cols=max_cols, show_dimensions=show_dimensions) formatter.to_string() @@ -1310,7 +1311,8 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, escape : boolean, default True Convert the characters <, >, and & to HTML-safe sequences.= max_rows : int, optional - Maximum number of rows to show before truncating. If None, show all. + Maximum number of rows to show before truncating. If None, show + all. max_cols : int, optional Maximum number of columns to show before truncating. If None, show all. @@ -1336,7 +1338,8 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, header=header, index=index, bold_rows=bold_rows, escape=escape, - max_rows=max_rows, max_cols=max_cols, + max_rows=max_rows, + max_cols=max_cols, show_dimensions=show_dimensions) formatter.to_html(classes=classes) @@ -1904,7 +1907,8 @@ def _ensure_valid_index(self, value): if not isinstance(value, Series): raise ValueError('Cannot set a frame with no defined index ' - 'and a value that cannot be converted to a Series') + 'and a value that cannot be converted to a ' + 'Series') self._data.set_axis(1, value.index.copy(), check_axis=False) def _set_item(self, key, value): @@ -4597,7 +4601,7 @@ def extract_index(data): def _prep_ndarray(values, copy=True): - if not isinstance(values, (np.ndarray,Series)): + if not isinstance(values, (np.ndarray, Series)): if len(values) == 0: return np.empty((0, 0), dtype=object) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4089b13fca5c7..624384e484dc0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -42,8 +42,8 @@ def is_dictlike(x): def _single_replace(self, to_replace, method, inplace, limit): if self.ndim != 1: - raise TypeError('cannot replace {0} with method {1} on a {2}'.format(to_replace, - method,type(self).__name__)) + raise TypeError('cannot replace {0} with method {1} on a {2}' + .format(to_replace, method, type(self).__name__)) orig_dtype = self.dtype result = self if inplace else self.copy() @@ -2047,7 +2047,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, # passing a single value that is scalar like # when value is None (GH5319), for compat if not is_dictlike(to_replace) and not is_dictlike(regex): - to_replace = [ to_replace ] + to_replace = [to_replace] if isinstance(to_replace, (tuple, list)): return _single_replace(self, to_replace, method, inplace, diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7a7fe32963457..960baa503036c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -649,9 +649,9 @@ def _index_with_as_index(self, b): original = self.obj.index gp = self.grouper levels = chain((gp.levels[i][gp.labels[i][b]] - for i in range(len(gp.groupings))), - (original.get_level_values(i)[b] - for i in range(original.nlevels))) + for i in range(len(gp.groupings))), + (original.get_level_values(i)[b] + for i in range(original.nlevels))) new = MultiIndex.from_arrays(list(levels)) new.names = gp.names + original.names return new @@ -2161,7 +2161,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): else: key_index = Index(keys, name=key_names[0]) - # make Nones an empty object if com._count_not_none(*values) != len(values): v = None 
@@ -2170,14 +2169,20 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): break if v is None: return DataFrame() - values = [ x if x is not None else v._constructor(**v._construct_axes_dict()) for x in values ] + values = [ + x if x is not None else + v._constructor(**v._construct_axes_dict()) + for x in values + ] v = values[0] if isinstance(v, (np.ndarray, Series)): if isinstance(v, Series): applied_index = self.obj._get_axis(self.axis) - all_indexed_same = _all_indexes_same([x.index for x in values ]) + all_indexed_same = _all_indexes_same([ + x.index for x in values + ]) singular_series = (len(values) == 1 and applied_index.nlevels == 1) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 08f935539ecfc..a4e273c43e483 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -830,7 +830,9 @@ def _reindex(keys, level=None): # see GH5553, make sure we use the right indexer new_indexer = np.arange(len(indexer)) - new_indexer[cur_indexer] = np.arange(len(result._get_axis(axis))) + new_indexer[cur_indexer] = np.arange( + len(result._get_axis(axis)) + ) new_indexer[missing_indexer] = -1 # we have a non_unique selector, need to use the original diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e8b18ae93b287..471136dc2386b 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3480,7 +3480,10 @@ def _delete_from_block(self, i, item): super(SingleBlockManager, self)._delete_from_block(i, item) # reset our state - self._block = self.blocks[0] if len(self.blocks) else make_block(np.array([],dtype=self._block.dtype),[],[]) + self._block = ( + self.blocks[0] if len(self.blocks) else + make_block(np.array([], dtype=self._block.dtype), [], []) + ) self._values = self._block.values def get_slice(self, slobj, raise_on_error=False): diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index d421fa36326aa..1244d0140a01b 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -786,6 +786,7 @@ def lreshape(data, groups, dropna=True, label=None): return DataFrame(mdata, columns=id_cols + pivot_cols) + def wide_to_long(df, stubnames, i, j): """ Wide panel to long format. Less flexible but more user-friendly than melt. 
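Since the next hunks rewrap the internals of wide_to_long, a short usage sketch may help orient the reader; the frame below is hypothetical and follows the shape the function expects (stub columns A/B suffixed by year, reshaped onto an (id, year) index):

    import numpy as np
    import pandas as pd

    # Hypothetical wide frame: stubs "A" and "B" observed in 1970 and 1980.
    df = pd.DataFrame({
        'A1970': ['a', 'b', 'c'],
        'A1980': ['d', 'e', 'f'],
        'B1970': [2.5, 1.2, 0.7],
        'B1980': [3.2, 1.3, 0.1],
        'X': np.random.randn(3),
    })
    df['id'] = df.index

    # Each stub is melted and the year suffix becomes the "year" level.
    long_df = pd.wide_to_long(df, ['A', 'B'], i='id', j='year')
    print(long_df)
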
@@ -848,8 +849,8 @@ def get_var_names(df, regex): def melt_stub(df, stub, i, j): varnames = get_var_names(df, "^"+stub) - newdf = melt(df, id_vars=i, value_vars=varnames, - value_name=stub, var_name=j) + newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub, + var_name=j) newdf_j = newdf[j].str.replace(stub, "") try: newdf_j = newdf_j.astype(int) @@ -870,6 +871,7 @@ def melt_stub(df, stub, i, j): newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False) return newdf.set_index([i, j]) + def convert_dummies(data, cat_variables, prefix_sep='_'): """ Compute DataFrame with specified columns converted to dummy variables (0 / diff --git a/pandas/io/auth.py b/pandas/io/auth.py index 15e3eb70d91b2..74b6b13000108 100644 --- a/pandas/io/auth.py +++ b/pandas/io/auth.py @@ -117,6 +117,7 @@ def init_service(http): """ return gapi.build('analytics', 'v3', http=http) + def reset_default_token_store(): import os os.remove(DEFAULT_TOKEN_FILE) diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py index 13135d255d9e2..143b507c41c3f 100644 --- a/pandas/io/clipboard.py +++ b/pandas/io/clipboard.py @@ -2,6 +2,7 @@ from pandas import compat, get_option, DataFrame from pandas.compat import StringIO + def read_clipboard(**kwargs): # pragma: no cover """ Read text from clipboard and pass to read_table. See read_table for the @@ -20,7 +21,10 @@ def read_clipboard(**kwargs): # pragma: no cover # try to decode (if needed on PY3) if compat.PY3: try: - text = compat.bytes_to_str(text,encoding=kwargs.get('encoding') or get_option('display.encoding')) + text = compat.bytes_to_str( + text, encoding=(kwargs.get('encoding') or + get_option('display.encoding')) + ) except: pass return read_table(StringIO(text), **kwargs) @@ -58,7 +62,7 @@ def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover if sep is None: sep = '\t' buf = StringIO() - obj.to_csv(buf,sep=sep, **kwargs) + obj.to_csv(buf, sep=sep, **kwargs) clipboard_set(buf.getvalue()) return except: @@ -70,4 +74,3 @@ def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover else: objstr = str(obj) clipboard_set(objstr) - diff --git a/pandas/io/common.py b/pandas/io/common.py index 6b8186e253199..d6b2827f94d36 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -13,7 +13,8 @@ _urlopen = urlopen from urllib.parse import urlparse as parse_url import urllib.parse as compat_parse - from urllib.parse import uses_relative, uses_netloc, uses_params, urlencode, urljoin + from urllib.parse import (uses_relative, uses_netloc, uses_params, + urlencode, urljoin) from urllib.error import URLError from http.client import HTTPException else: @@ -72,8 +73,8 @@ def _is_s3_url(url): def maybe_read_encoded_stream(reader, encoding=None): - """ read an encoded stream from the reader and transform the bytes to unicode - if required based on the encoding + """read an encoded stream from the reader and transform the bytes to + unicode if required based on the encoding Parameters ---------- @@ -84,7 +85,7 @@ def maybe_read_encoded_stream(reader, encoding=None): ------- a tuple of (a stream of decoded bytes, the encoding which was used) - """ + """ if compat.PY3 or encoding is not None: # pragma: no cover if encoding: @@ -97,6 +98,7 @@ def maybe_read_encoded_stream(reader, encoding=None): encoding = None return reader, encoding + def get_filepath_or_buffer(filepath_or_buffer, encoding=None): """ If the filepath_or_buffer is a url, translate and return the buffer @@ -114,7 +116,7 @@ def get_filepath_or_buffer(filepath_or_buffer, 
encoding=None): if _is_url(filepath_or_buffer): req = _urlopen(str(filepath_or_buffer)) - return maybe_read_encoded_stream(req,encoding) + return maybe_read_encoded_stream(req, encoding) if _is_s3_url(filepath_or_buffer): try: diff --git a/pandas/io/data.py b/pandas/io/data.py index cf49515cac576..a3968446930e8 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -469,6 +469,7 @@ def fetch_data(url, name): axis=1, join='outer') return df + def get_data_famafrench(name): # path of zip files zip_file_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/' diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index ef92b8692c07f..3ffcef4b21552 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -26,7 +26,7 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, minute_col = _maybe_cast(minute_col) second_col = _maybe_cast(second_col) return lib.try_parse_datetime_components(year_col, month_col, day_col, - hour_col, minute_col, second_col) + hour_col, minute_col, second_col) def generic_parser(parse_func, *cols): diff --git a/pandas/io/excel.py b/pandas/io/excel.py index b97c9da0b0d18..ad7c37fba4c2f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -547,8 +547,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): colletter = get_column_letter(col) xcell = wks.cell("%s%s" % (colletter, row)) for field in style.__fields__: - xcell.style.__setattr__(field, \ - style.__getattribute__(field)) + xcell.style.__setattr__( + field, style.__getattribute__(field)) @classmethod def _convert_to_style(cls, style_dict): @@ -778,10 +778,10 @@ def _convert_to_style(self, style_dict, num_format_str=None): alignment = style_dict.get('alignment') if alignment: if (alignment.get('horizontal') - and alignment['horizontal'] == 'center'): + and alignment['horizontal'] == 'center'): xl_format.set_align('center') if (alignment.get('vertical') - and alignment['vertical'] == 'top'): + and alignment['vertical'] == 'top'): xl_format.set_align('top') # Map the cell borders to XlsxWriter border properties. 
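The _convert_to_style hunk above only re-wraps the alignment checks; for readers unfamiliar with that code path, here is a self-contained sketch of the same dict-to-XlsxWriter mapping (it assumes the xlsxwriter package is installed, and the simplified style dict is a stand-in for pandas' actual style schema, not an exact copy of it):

    import xlsxwriter

    style_dict = {'alignment': {'horizontal': 'center', 'vertical': 'top'}}

    workbook = xlsxwriter.Workbook('styled.xlsx')
    worksheet = workbook.add_worksheet()

    xl_format = workbook.add_format()
    alignment = style_dict.get('alignment') or {}
    if alignment.get('horizontal') == 'center':
        xl_format.set_align('center')   # horizontal centering
    if alignment.get('vertical') == 'top':
        xl_format.set_align('top')      # vertical alignment uses the same setter

    worksheet.write(0, 0, 'styled cell', xl_format)
    workbook.close()
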
diff --git a/pandas/io/ga.py b/pandas/io/ga.py index 4391b2637a837..f002994888932 100644 --- a/pandas/io/ga.py +++ b/pandas/io/ga.py @@ -48,8 +48,8 @@ %s """ % _QUERY_PARAMS -_GA_READER_DOC = """Given query parameters, return a DataFrame with all the data -or an iterator that returns DataFrames containing chunks of the data +_GA_READER_DOC = """Given query parameters, return a DataFrame with all the +data or an iterator that returns DataFrames containing chunks of the data Parameters ---------- @@ -89,12 +89,14 @@ Local host redirect if unspecified """ + def reset_token_store(): """ Deletes the default token store """ auth.reset_default_token_store() + @Substitution(extras=_AUTH_PARAMS) @Appender(_GA_READER_DOC) def read_ga(metrics, dimensions, start_date, **kwargs): @@ -185,9 +187,8 @@ def _init_service(self, secrets): return auth.init_service(http) def get_account(self, name=None, id=None, **kwargs): - """ - Retrieve an account that matches the name, id, or some account attribute - specified in **kwargs + """ Retrieve an account that matches the name, id, or some account + attribute specified in **kwargs Parameters ---------- @@ -385,6 +386,7 @@ def _maybe_add_arg(query, field, data, prefix='ga'): data = ','.join(['%s:%s' % (prefix, x) for x in data]) query[field] = data + def _get_match(obj_store, name, id, **kwargs): key, val = None, None if len(kwargs) > 0: diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 2d490ec071b4e..010277533589c 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -38,7 +38,8 @@ # These are some custom exceptions that the # to_gbq() method can throw -class SchemaMissing(PandasError,IOError): + +class SchemaMissing(PandasError, IOError): """ Raised when attempting to write a DataFrame to a new table in Google BigQuery without specifying @@ -46,14 +47,16 @@ class SchemaMissing(PandasError,IOError): """ pass -class InvalidSchema(PandasError,IOError): + +class InvalidSchema(PandasError, IOError): """ Raised when attempting to write a DataFrame to Google BigQuery with an invalid table schema. 
""" pass -class TableExistsFail(PandasError,IOError): + +class TableExistsFail(PandasError, IOError): """ Raised when attempting to write a DataFrame to an existing Google BigQuery table without specifying @@ -61,7 +64,8 @@ class TableExistsFail(PandasError,IOError): """ pass -class InvalidColumnOrder(PandasError,IOError): + +class InvalidColumnOrder(PandasError, IOError): """ Raised when the provided column order for output results DataFrame does not match the schema @@ -83,6 +87,7 @@ def _authenticate(): """ return bq.Client.Get() + def _parse_entry(field_value, field_type): """ Given a value and the corresponding BigQuery data type, @@ -147,10 +152,7 @@ def _parse_page(raw_page, col_names, col_types, col_dtypes): page_row_count = len(raw_page) # Place to hold the results for a page of data - page_array = np.zeros( - (page_row_count,), - dtype=zip(col_names,col_dtypes) - ) + page_array = np.zeros((page_row_count,), dtype=zip(col_names, col_dtypes)) for row_num, raw_row in enumerate(raw_page): entries = raw_row.get('f', []) # Iterate over each entry - setting proper field types @@ -163,6 +165,7 @@ def _parse_page(raw_page, col_names, col_types, col_dtypes): return page_array + def _parse_data(client, job, index_col=None, col_order=None): """ Iterate through the query results and piece together the @@ -196,9 +199,9 @@ def _parse_data(client, job, index_col=None, col_order=None): Notes: ----- - This script relies on Google being consistent with their + This script relies on Google being consistent with their pagination API. We are using the most flexible iteration method - that we could find in the bq.py/bigquery_client.py API's, but + that we could find in the bq.py/bigquery_client.py API's, but these have undergone large amounts of change recently. We have encountered bugs with this functionality, see: @@ -209,10 +212,11 @@ def _parse_data(client, job, index_col=None, col_order=None): # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing dtype_map = {'INTEGER': np.dtype(float), 'FLOAT': np.dtype(float), - 'TIMESTAMP': 'M8[ns]'} # This seems to be buggy without nanosecond indicator + 'TIMESTAMP': 'M8[ns]'} # This seems to be buggy without + # nanosecond indicator # We first need the schema to get information about the columns of - # our dataframe. + # our dataframe. table_dict = job['configuration']['query']['destinationTable'] fields = client.GetTableSchema(table_dict)['fields'] @@ -226,23 +230,23 @@ def _parse_data(client, job, index_col=None, col_order=None): # TODO: Do this in one clean step for field in fields: col_types.append(field['type']) - # Note the encoding... numpy doesn't like titles that are UTF8, which is the return - # type from the API + # Note the encoding... numpy doesn't like titles that are UTF8, which + # is the return type from the API col_names.append(field['name'].encode('ascii', 'ignore')) - # Note, it would be nice to use 'str' types, but BigQuery doesn't have a fixed length - # in mind - just maxes out at 64k - col_dtypes.append(dtype_map.get(field['type'],object)) + # Note, it would be nice to use 'str' types, but BigQuery doesn't have + # a fixed length in mind - just maxes out at 64k + col_dtypes.append(dtype_map.get(field['type'], object)) - # How many columns are there num_columns = len(col_names) - + # Iterate over the result rows. # Since Google's API now requires pagination of results, - # we do that here. The following is repurposed from + # we do that here. 
The following is repurposed from # bigquery_client.py :: Client._JobTableReader._ReadOnePage - # TODO: Enable Reading From Table, see Client._TableTableReader._ReadOnePage + # TODO: Enable Reading From Table, + # see Client._TableTableReader._ReadOnePage # Initially, no page token is set page_token = None @@ -254,13 +258,12 @@ def _parse_data(client, job, index_col=None, col_order=None): total_rows = max_rows # This is the starting row for a particular page... - # is ignored if page_token is present, though + # is ignored if page_token is present, though # it may be useful if we wish to implement SQL like LIMITs # with minimums start_row = 0 - # Keep our page DataFrames until the end when we - # concatentate them + # Keep our page DataFrames until the end when we concatenate them dataframe_list = list() current_job = job['jobReference'] @@ -298,7 +301,8 @@ def _parse_data(client, job, index_col=None, col_order=None): start_row += len(raw_page) if total_rows > 0: completed = (100 * start_row) / total_rows - logger.info('Remaining Rows: ' + str(total_rows - start_row) + '(' + str(completed) + '% Complete)') + logger.info('Remaining Rows: ' + str(total_rows - start_row) + '(' + + str(completed) + '% Complete)') else: logger.info('No Rows') @@ -308,8 +312,9 @@ def _parse_data(client, job, index_col=None, col_order=None): # but we felt it was still a good idea. if not page_token and not raw_page and start_row != total_rows: raise bigquery_client.BigqueryInterfaceError( - ("Not enough rows returned by server. Expected: {0}" + \ - " Rows, But Recieved {1}").format(total_rows, start_row)) + 'Not enough rows returned by server. Expected: {0} Rows, But ' + 'Received {1}'.format(total_rows, start_row) + ) # Build final dataframe final_df = concat(dataframe_list, ignore_index=True) @@ -320,14 +325,19 @@ def _parse_data(client, job, index_col=None, col_order=None): final_df.set_index(index_col, inplace=True) col_names.remove(index_col) else: - raise InvalidColumnOrder('Index column "{0}" does not exist in DataFrame.'.format(index_col)) + raise InvalidColumnOrder( + 'Index column "{0}" does not exist in DataFrame.' + .format(index_col) + ) # Change the order of columns in the DataFrame based on provided list if col_order is not None: if sorted(col_order) == sorted(col_names): final_df = final_df[col_order] else: - raise InvalidColumnOrder('Column order does not match this DataFrame.') + raise InvalidColumnOrder( + 'Column order does not match this DataFrame.' + ) # Downcast floats to integers and objects to booleans # if there are no NaN's. This is presently due to a @@ -335,13 +345,15 @@ def _parse_data(client, job, index_col=None, col_order=None): final_df._data = final_df._data.downcast(dtypes='infer') return final_df -def to_gbq(dataframe, destination_table, schema=None, col_order=None, if_exists='fail', **kwargs): - """Write a DataFrame to a Google BigQuery table. - - If the table exists, the DataFrame will be appended. If not, a new table - will be created, in which case the schema will have to be specified. By default, - rows will be written in the order they appear in the DataFrame, though - the user may specify an alternative order. + +def to_gbq(dataframe, destination_table, schema=None, col_order=None, + if_exists='fail', **kwargs): + """Write a DataFrame to a Google BigQuery table. + + If the table exists, the DataFrame will be appended. If not, a new table + will be created, in which case the schema will have to be specified. 
By + default, rows will be written in the order they appear in the DataFrame, + though the user may specify an alternative order. Parameters ---------- @@ -350,9 +362,11 @@ def to_gbq(dataframe, destination_table, schema=None, col_order=None, if_exists= destination_table : string name of table to be written, in the form 'dataset.tablename' schema : sequence (optional) - list of column types in order for data to be inserted, e.g. ['INTEGER', 'TIMESTAMP', 'BOOLEAN'] + list of column types in order for data to be inserted, + e.g. ['INTEGER', 'TIMESTAMP', 'BOOLEAN'] col_order : sequence (optional) - order which columns are to be inserted, e.g. ['primary_key', 'birthday', 'username'] + order which columns are to be inserted, + e.g. ['primary_key', 'birthday', 'username'] if_exists : {'fail', 'replace', 'append'} (optional) - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. @@ -362,42 +376,50 @@ def to_gbq(dataframe, destination_table, schema=None, col_order=None, if_exists= Raises ------ SchemaMissing : - Raised if the 'if_exists' parameter is set to 'replace', but no schema is specified + Raised if the 'if_exists' parameter is set to 'replace', but no schema + is specified TableExists : - Raised if the specified 'destination_table' exists but the 'if_exists' parameter is set to 'fail' (the default) + Raised if the specified 'destination_table' exists but the 'if_exists' + parameter is set to 'fail' (the default) InvalidSchema : Raised if the 'schema' parameter does not match the provided DataFrame """ if not _BQ_INSTALLED: if sys.version_info >= (3, 0): - raise NotImplementedError('gbq module does not support Python 3 yet') + raise NotImplementedError('gbq module does not support Python 3 ' + 'yet') else: raise ImportError('Could not import Google BigQuery Client.') if not _BQ_VALID_VERSION: - raise ImportError("pandas requires bigquery >= 2.0.17 for Google BigQuery " - "support, current version " + _BQ_VERSION) + raise ImportError("pandas requires bigquery >= 2.0.17 for Google " + "BigQuery support, current version " + _BQ_VERSION) - ALLOWED_TYPES = ['STRING', 'INTEGER', 'FLOAT', 'BOOLEAN', 'TIMESTAMP', 'RECORD'] + ALLOWED_TYPES = ['STRING', 'INTEGER', 'FLOAT', 'BOOLEAN', 'TIMESTAMP', + 'RECORD'] if if_exists == 'replace' and schema is None: - raise SchemaMissing('Cannot replace a table without specifying the data schema') + raise SchemaMissing('Cannot replace a table without specifying the ' + 'data schema') else: client = _authenticate() table_reference = client.GetTableReference(destination_table) if client.TableExists(table_reference): if if_exists == 'fail': - raise TableExistsFail('Cannot overwrite existing tables if \'if_exists="fail"\'') + raise TableExistsFail('Cannot overwrite existing tables if ' + '\'if_exists="fail"\'') else: - # Build up a string representation of the + # Build up a string representation of the # table's schema. Since the table already # exists, we ask ask the API for it, which # is returned in a list of dictionaries # describing column data. Iterate over these # and build up a string of form: # "col_name1 : col_type1, col_name2 : col_type2..." 
- schema_full = client.GetTableSchema(dict(table_reference))['fields'] + schema_full = client.GetTableSchema( + dict(table_reference) + )['fields'] schema = '' for count, row in enumerate(schema_full): if count > 0: @@ -406,11 +428,13 @@ def to_gbq(dataframe, destination_table, schema=None, col_order=None, if_exists= else: logger.info('Creating New Table') if schema is None: - raise SchemaMissing('Cannot create a new table without specifying the data schema') + raise SchemaMissing('Cannot create a new table without ' + 'specifying the data schema') else: columns = dataframe.columns if len(schema) != len(columns): - raise InvalidSchema('Incorrect number of columns in schema') + raise InvalidSchema('Incorrect number of columns in ' + 'schema') else: schema_string = '' for count, name in enumerate(columns): @@ -420,7 +444,9 @@ def to_gbq(dataframe, destination_table, schema=None, col_order=None, if_exists= if column_type in ALLOWED_TYPES: schema_string += name + ':' + schema[count].lower() else: - raise InvalidSchema('Invalid Type: ' + column_type + ". Must be one of: " + str(ALLOWED_TYPES)) + raise InvalidSchema('Invalid Type: ' + column_type + + ". Must be one of: " + + str(ALLOWED_TYPES)) schema = schema_string opts = kwargs @@ -437,18 +463,22 @@ def to_gbq(dataframe, destination_table, schema=None, col_order=None, if_exists= with tempfile.NamedTemporaryFile() as csv_file: dataframe.to_csv(csv_file.name, index=False, encoding='utf-8') - job = client.Load(table_reference, csv_file.name, schema=schema, **opts) + job = client.Load(table_reference, csv_file.name, schema=schema, + **opts) -def read_gbq(query, project_id = None, destination_table = None, index_col=None, col_order=None, **kwargs): + +def read_gbq(query, project_id=None, destination_table=None, index_col=None, + col_order=None, **kwargs): """Load data from Google BigQuery. - - The main method a user calls to load data from Google BigQuery into a pandas DataFrame. - This is a simple wrapper for Google's bq.py and bigquery_client.py, which we use - to get the source data. Because of this, this script respects the user's bq settings - file, '~/.bigqueryrc', if it exists. Such a file can be generated using 'bq init'. Further, - additional parameters for the query can be specified as either ``**kwds`` in the command, - or using FLAGS provided in the 'gflags' module. Particular options can be found in - bigquery_client.py. + + The main method a user calls to load data from Google BigQuery into a + pandas DataFrame. This is a simple wrapper for Google's bq.py and + bigquery_client.py, which we use to get the source data. Because of this, + this script respects the user's bq settings file, '~/.bigqueryrc', if it + exists. Such a file can be generated using 'bq init'. Further, additional + parameters for the query can be specified as either ``**kwds`` in the + command, or using FLAGS provided in the 'gflags' module. Particular options + can be found in bigquery_client.py. Parameters ---------- @@ -464,8 +494,8 @@ def read_gbq(query, project_id = None, destination_table = None, index_col=None, DataFrame destination_table : string (optional) If provided, send the results to the given table. - **kwargs : - To be passed to bq.Client.Create(). Particularly: 'trace', + **kwargs : + To be passed to bq.Client.Create(). 
Particularly: 'trace', 'sync', 'api', 'api_version' Returns @@ -476,13 +506,14 @@ def read_gbq(query, project_id = None, destination_table = None, index_col=None, """ if not _BQ_INSTALLED: if sys.version_info >= (3, 0): - raise NotImplementedError('gbq module does not support Python 3 yet') + raise NotImplementedError('gbq module does not support Python 3 ' + 'yet') else: raise ImportError('Could not import Google BigQuery Client.') if not _BQ_VALID_VERSION: - raise ImportError("pandas requires bigquery >= 2.0.17 for Google BigQuery " - "support, current version " + _BQ_VERSION) + raise ImportError('pandas requires bigquery >= 2.0.17 for Google ' + 'BigQuery support, current version ' + _BQ_VERSION) query_args = kwargs query_args['project_id'] = project_id @@ -493,5 +524,5 @@ def read_gbq(query, project_id = None, destination_table = None, index_col=None, client = _authenticate() job = client.Query(**query_args) - + return _parse_data(client, job, index_col=index_col, col_order=col_order) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 08299738f31a2..5d392e94106e9 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -49,7 +49,8 @@ from pandas.compat import u, PY3 from pandas import ( Timestamp, Period, Series, DataFrame, Panel, Panel4D, - Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, Float64Index, NaT + Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, Float64Index, + NaT ) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex @@ -87,7 +88,8 @@ def to_msgpack(path_or_buf, *args, **kwargs): args : an object or objects to serialize append : boolean whether to append to an existing msgpack (default is False) - compress : type of compressor (zlib or blosc), default to None (no compression) + compress : type of compressor (zlib or blosc), default to None (no + compression) """ global compressor compressor = kwargs.pop('compress', None) @@ -111,6 +113,7 @@ def writer(fh): else: writer(path_or_buf) + def read_msgpack(path_or_buf, iterator=False, **kwargs): """ Load msgpack pandas object from the specified @@ -153,7 +156,7 @@ def read(fh): return read(fh) # treat as a string-like - if not hasattr(path_or_buf,'read'): + if not hasattr(path_or_buf, 'read'): try: fh = compat.BytesIO(path_or_buf) @@ -230,6 +233,7 @@ def convert(values): # ndarray (on original dtype) return v.tostring() + def unconvert(values, dtype, compress=None): if dtype == np.object_: @@ -251,7 +255,8 @@ def unconvert(values, dtype, compress=None): return np.frombuffer(values, dtype=dtype) # from a string - return np.fromstring(values.encode('latin1'),dtype=dtype) + return np.fromstring(values.encode('latin1'), dtype=dtype) + def encode(obj): """ @@ -264,11 +269,11 @@ def encode(obj): return {'typ': 'period_index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), - 'freq': getattr(obj,'freqstr',None), + 'freq': getattr(obj, 'freqstr', None), 'dtype': obj.dtype.num, 'data': convert(obj.asi8)} elif isinstance(obj, DatetimeIndex): - tz = getattr(obj,'tz',None) + tz = getattr(obj, 'tz', None) # store tz info and data as UTC if tz is not None: @@ -279,8 +284,8 @@ def encode(obj): 'name': getattr(obj, 'name', None), 'dtype': obj.dtype.num, 'data': convert(obj.asi8), - 'freq': getattr(obj,'freqstr',None), - 'tz': tz } + 'freq': getattr(obj, 'freqstr', None), + 'tz': tz} elif isinstance(obj, MultiIndex): return {'typ': 'multi_index', 'klass': obj.__class__.__name__, @@ -295,7 +300,9 @@ def 
encode(obj): 'data': convert(obj.values)} elif isinstance(obj, Series): if isinstance(obj, SparseSeries): - raise NotImplementedError("msgpack sparse series is not implemented") + raise NotImplementedError( + 'msgpack sparse series is not implemented' + ) #d = {'typ': 'sparse_series', # 'klass': obj.__class__.__name__, # 'dtype': obj.dtype.num, @@ -316,7 +323,9 @@ def encode(obj): 'compress': compressor} elif issubclass(tobj, NDFrame): if isinstance(obj, SparseDataFrame): - raise NotImplementedError("msgpack sparse frame is not implemented") + raise NotImplementedError( + 'msgpack sparse frame is not implemented' + ) #d = {'typ': 'sparse_dataframe', # 'klass': obj.__class__.__name__, # 'columns': obj.columns} @@ -326,7 +335,9 @@ def encode(obj): # for name, ss in compat.iteritems(obj)]) #return d elif isinstance(obj, SparsePanel): - raise NotImplementedError("msgpack sparse frame is not implemented") + raise NotImplementedError( + 'msgpack sparse frame is not implemented' + ) #d = {'typ': 'sparse_panel', # 'klass': obj.__class__.__name__, # 'items': obj.items} @@ -353,7 +364,8 @@ def encode(obj): 'compress': compressor } for b in data.blocks]} - elif isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64)): + elif isinstance(obj, (datetime, date, np.datetime64, timedelta, + np.timedelta64)): if isinstance(obj, Timestamp): tz = obj.tzinfo if tz is not None: @@ -436,18 +448,22 @@ def decode(obj): return Period(ordinal=obj['ordinal'], freq=obj['freq']) elif typ == 'index': dtype = dtype_for(obj['dtype']) - data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress')) + data = unconvert(obj['data'], np.typeDict[obj['dtype']], + obj.get('compress')) return globals()[obj['klass']](data, dtype=dtype, name=obj['name']) elif typ == 'multi_index': - data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress')) - data = [ tuple(x) for x in data ] + data = unconvert(obj['data'], np.typeDict[obj['dtype']], + obj.get('compress')) + data = [tuple(x) for x in data] return globals()[obj['klass']].from_tuples(data, names=obj['names']) elif typ == 'period_index': data = unconvert(obj['data'], np.int64, obj.get('compress')) - return globals()[obj['klass']](data, name=obj['name'], freq=obj['freq']) + return globals()[obj['klass']](data, name=obj['name'], + freq=obj['freq']) elif typ == 'datetime_index': data = unconvert(obj['data'], np.int64, obj.get('compress')) - result = globals()[obj['klass']](data, freq=obj['freq'], name=obj['name']) + result = globals()[obj['klass']](data, freq=obj['freq'], + name=obj['name']) tz = obj['tz'] # reverse tz conversion @@ -457,13 +473,17 @@ def decode(obj): elif typ == 'series': dtype = dtype_for(obj['dtype']) index = obj['index'] - return globals()[obj['klass']](unconvert(obj['data'], dtype, obj['compress']), index=index, name=obj['name']) + return globals()[obj['klass']](unconvert(obj['data'], dtype, + obj['compress']), + index=index, name=obj['name']) elif typ == 'block_manager': axes = obj['axes'] def create_block(b): dtype = dtype_for(b['dtype']) - return make_block(unconvert(b['values'], dtype, b['compress']).reshape(b['shape']), b['items'], axes[0], klass=getattr(internals, b['klass'])) + return make_block(unconvert(b['values'], dtype, b['compress']) + .reshape(b['shape']), b['items'], axes[0], + klass=getattr(internals, b['klass'])) blocks = [create_block(b) for b in obj['blocks']] return globals()[obj['klass']](BlockManager(blocks, axes)) @@ -479,21 +499,29 @@ def create_block(b): return 
np.timedelta64(int(obj['data'])) #elif typ == 'sparse_series': # dtype = dtype_for(obj['dtype']) - # return globals( - # )[obj['klass']](unconvert(obj['sp_values'], dtype, obj['compress']), sparse_index=obj['sp_index'], - # index=obj['index'], fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name']) + # return globals()[obj['klass']]( + # unconvert(obj['sp_values'], dtype, obj['compress']), + # sparse_index=obj['sp_index'], index=obj['index'], + # fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name']) #elif typ == 'sparse_dataframe': - # return globals()[obj['klass']](obj['data'], - # columns=obj['columns'], default_fill_value=obj['default_fill_value'], default_kind=obj['default_kind']) + # return globals()[obj['klass']]( + # obj['data'], columns=obj['columns'], + # default_fill_value=obj['default_fill_value'], + # default_kind=obj['default_kind'] + # ) #elif typ == 'sparse_panel': - # return globals()[obj['klass']](obj['data'], - # items=obj['items'], default_fill_value=obj['default_fill_value'], default_kind=obj['default_kind']) + # return globals()[obj['klass']]( + # obj['data'], items=obj['items'], + # default_fill_value=obj['default_fill_value'], + # default_kind=obj['default_kind']) elif typ == 'block_index': - return globals()[obj['klass']](obj['length'], obj['blocs'], obj['blengths']) + return globals()[obj['klass']](obj['length'], obj['blocs'], + obj['blengths']) elif typ == 'int_index': return globals()[obj['klass']](obj['length'], obj['indices']) elif typ == 'ndarray': - return unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress')).reshape(obj['shape']) + return unconvert(obj['data'], np.typeDict[obj['dtype']], + obj.get('compress')).reshape(obj['shape']) elif typ == 'np_scalar': if obj.get('sub_typ') == 'np_complex': return c2f(obj['real'], obj['imag'], obj['dtype']) @@ -585,7 +613,7 @@ def __iter__(self): try: path_exists = os.path.exists(self.path) - except (TypeError): + except TypeError: path_exists = False if path_exists: @@ -595,7 +623,7 @@ def __iter__(self): else: - if not hasattr(self.path,'read'): + if not hasattr(self.path, 'read'): fh = compat.BytesIO(self.path) else: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e62ecd5a541df..bd0649a7a85f3 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -30,14 +30,15 @@ Parameters ---------- filepath_or_buffer : string or file handle / StringIO. The string could be - a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host - is expected. For instance, a local file could be + a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a + host is expected. For instance, a local file could be file ://localhost/path/to/table.csv %s lineterminator : string (length 1), default None Character to break file into lines. Only valid with C parser quotechar : string - The character to used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored. + The character to used to denote the start and end of a quoted item. Quoted + items can include the delimiter and it will be ignored. quoting : int Controls whether quotes should be recognized. Values are taken from `csv.QUOTE_*` values. Acceptable values are 0, 1, 2, and 3 for @@ -55,9 +56,9 @@ header : int row number(s) to use as the column names, and the start of the data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly pass ``header=0`` to be able to replace existing names. 
The header can be - a list of integers that specify row locations for a multi-index on the columns - E.g. [0,1,3]. Intervening rows that are not specified will be skipped. - (E.g. 2 in this example are skipped) + a list of integers that specify row locations for a multi-index on the + columns E.g. [0,1,3]. Intervening rows that are not specified will be + skipped. (E.g. 2 in this example are skipped) skiprows : list-like or integer Row numbers to skip (0-indexed) or number of rows to skip (int) at the start of the file @@ -251,7 +252,7 @@ def _read(filepath_or_buffer, kwds): 'squeeze': False, 'compression': None, 'mangle_dupe_cols': True, - 'tupleize_cols':False, + 'tupleize_cols': False, } @@ -437,9 +438,10 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds): # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', - '#N/A','N/A', 'NA', '#NA', 'NULL', 'NaN', - 'nan', '']) +_NA_VALUES = set([ + '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A', 'N/A', 'NA', '#NA', + 'NULL', 'NaN', 'nan', '' +]) class TextFileReader(object): @@ -653,14 +655,14 @@ def __init__(self, kwds): self.na_fvalues = kwds.get('na_fvalues') self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') - self.tupleize_cols = kwds.get('tupleize_cols',False) + self.tupleize_cols = kwds.get('tupleize_cols', False) self._date_conv = _make_date_converter(date_parser=self.date_parser, dayfirst=self.dayfirst) # validate header options for mi self.header = kwds.get('header') - if isinstance(self.header,(list,tuple,np.ndarray)): + if isinstance(self.header, (list, tuple, np.ndarray)): if kwds.get('as_recarray'): raise ValueError("cannot specify as_recarray when " "specifying a multi-index header") @@ -702,7 +704,8 @@ def _should_parse_dates(self, i): else: return (j in self.parse_dates) or (name in self.parse_dates) - def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names=False): + def _extract_multi_indexer_columns(self, header, index_names, col_names, + passed_names=False): """ extract and return the names, index_names, col_names header is a list-of-lists returned from the parsers """ if len(header) < 2: @@ -715,8 +718,8 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_ if ic is None: ic = [] - if not isinstance(ic, (list,tuple,np.ndarray)): - ic = [ ic ] + if not isinstance(ic, (list, tuple, np.ndarray)): + ic = [ic] sic = set(ic) # clean the index_names @@ -726,22 +729,29 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_ # extract the columns field_count = len(header[0]) + def extract(r): - return tuple([ r[i] for i in range(field_count) if i not in sic ]) - columns = lzip(*[ extract(r) for r in header ]) + return tuple([r[i] for i in range(field_count) if i not in sic]) + + columns = lzip(*[extract(r) for r in header]) names = ic + columns - # if we find 'Unnamed' all of a single level, then our header was too long + # if we find 'Unnamed' all of a single level, then our header was too + # long for n in range(len(columns[0])): - if all([ 'Unnamed' in c[n] for c in columns ]): - raise _parser.CParserError("Passed header=[%s] are too many rows for this " - "multi_index of columns" % ','.join([ str(x) for x in self.header ])) + if all(['Unnamed' in c[n] for c in columns]): + raise _parser.CParserError( + "Passed header=[%s] are too many rows for this " + "multi_index of 
columns" + % ','.join([str(x) for x in self.header]) + ) # clean the column names (if we have an index_col) if len(ic): - col_names = [ r[0] if len(r[0]) and 'Unnamed' not in r[0] else None for r in header ] + col_names = [r[0] if len(r[0]) and 'Unnamed' not in r[0] else None + for r in header] else: - col_names = [ None ] * len(header) + col_names = [None] * len(header) passed_names = True @@ -749,9 +759,10 @@ def extract(r): def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here - if not self.tupleize_cols and len(columns) and not isinstance( - columns, MultiIndex) and all([ isinstance(c,tuple) for c in columns]): - columns = MultiIndex.from_tuples(columns,names=col_names) + if (not self.tupleize_cols and len(columns) and + not isinstance(columns, MultiIndex) and + all([isinstance(c, tuple) for c in columns])): + columns = MultiIndex.from_tuples(columns, names=col_names) return columns def _make_index(self, data, alldata, columns, indexnamerow=False): @@ -849,9 +860,8 @@ def _agg_index(self, index, try_parse_dates=True): if isinstance(self.na_values, dict): col_name = self.index_names[i] if col_name is not None: - col_na_values, col_na_fvalues = _get_na_values(col_name, - self.na_values, - self.na_fvalues) + col_na_values, col_na_fvalues = _get_na_values( + col_name, self.na_values, self.na_fvalues) arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) @@ -865,14 +875,14 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, result = {} for c, values in compat.iteritems(dct): conv_f = None if converters is None else converters.get(c, None) - col_na_values, col_na_fvalues = _get_na_values(c, na_values, na_fvalues) + col_na_values, col_na_fvalues = _get_na_values(c, na_values, + na_fvalues) coerce_type = True if conv_f is not None: values = lib.map_infer(values, conv_f) coerce_type = False - cvals, na_count = self._convert_types(values, - set(col_na_values) | col_na_fvalues, - coerce_type) + cvals, na_count = self._convert_types( + values, set(col_na_values) | col_na_fvalues, coerce_type) result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) @@ -951,8 +961,12 @@ def __init__(self, src, **kwds): else: if len(self._reader.header) > 1: # we have a multi index in the columns - self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( - self._reader.header, self.index_names, self.col_names, passed_names) + self.names, self.index_names, self.col_names, passed_names = ( + self._extract_multi_indexer_columns( + self._reader.header, self.index_names, self.col_names, + passed_names + ) + ) else: self.names = list(self._reader.header[0]) @@ -963,8 +977,9 @@ def __init__(self, src, **kwds): else: self.names = lrange(self._reader.table_width) - # If the names were inferred (not passed by user) and usedcols is defined, - # then ensure names refers to the used columns, not the document's columns. + # If the names were inferred (not passed by user) and usedcols is + # defined, then ensure names refers to the used columns, not the + # document's columns. 
if self.usecols and passed_names: col_indices = [] for u in self.usecols: @@ -972,7 +987,8 @@ def __init__(self, src, **kwds): col_indices.append(self.names.index(u)) else: col_indices.append(u) - self.names = [n for i, n in enumerate(self.names) if i in col_indices] + self.names = [n for i, n in enumerate(self.names) + if i in col_indices] if len(self.names) < len(self.usecols): raise ValueError("Usecols do not match names.") @@ -982,11 +998,12 @@ def __init__(self, src, **kwds): if not self._has_complex_date_col: if (self._reader.leading_cols == 0 and - _is_index_col(self.index_col)): + _is_index_col(self.index_col)): self._name_processed = True (index_names, self.names, - self.index_col) = _clean_index_names(self.names, self.index_col) + self.index_col) = _clean_index_names(self.names, + self.index_col) if self.index_names is None: self.index_names = index_names @@ -1265,8 +1282,11 @@ def __init__(self, f, **kwds): # The original set is stored in self.original_columns. if len(self.columns) > 1: # we are processing a multi index column - self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( - self.columns, self.index_names, self.col_names) + self.columns, self.index_names, self.col_names, _ = ( + self._extract_multi_indexer_columns( + self.columns, self.index_names, self.col_names + ) + ) # Update list of original names to include all indices. self.num_original_columns = len(self.columns) else: @@ -1291,7 +1311,8 @@ def __init__(self, f, **kwds): self._no_thousands_columns = None def _set_no_thousands_columns(self): - # Create a set of column ids that are not to be stripped of thousands operators. + # Create a set of column ids that are not to be stripped of thousands + # operators. noconvert_columns = set() def _set(x): @@ -1478,7 +1499,8 @@ def _infer_columns(self): for i, c in enumerate(line): if c == '': if have_mi_columns: - this_columns.append('Unnamed: %d_level_%d' % (i, level)) + this_columns.append('Unnamed: %d_level_%d' + % (i, level)) else: this_columns.append('Unnamed: %d' % i) unnamed_count += 1 @@ -1494,16 +1516,17 @@ def _infer_columns(self): counts[col] = cur_count + 1 elif have_mi_columns: - # if we have grabbed an extra line, but its not in our format - # so save in the buffer, and create an blank extra line for the rest of the - # parsing code + # if we have grabbed an extra line, but its not in our + # format so save in the buffer, and create an blank extra + # line for the rest of the parsing code if hr == header[-1]: lc = len(this_columns) - ic = len(self.index_col) if self.index_col is not None else 0 + ic = (len(self.index_col) + if self.index_col is not None else 0) if lc != unnamed_count and lc-ic > unnamed_count: clear_buffer = False - this_columns = [ None ] * lc - self.buf = [ self.buf[-1] ] + this_columns = [None] * lc + self.buf = [self.buf[-1]] columns.append(this_columns) if len(columns) == 1: @@ -1513,17 +1536,19 @@ def _infer_columns(self): self._clear_buffer() if names is not None: - if (self.usecols is not None and len(names) != len(self.usecols)) \ - or (self.usecols is None and len(names) != len(columns[0])): - + if ((self.usecols is not None + and len(names) != len(self.usecols)) + or (self.usecols is None + and len(names) != len(columns[0]))): raise ValueError('Number of passed names did not match ' - 'number of header fields in the file') + 'number of header fields in the file') if len(columns) > 1: raise TypeError('Cannot pass names with multi-index ' 'columns') if self.usecols is not None: - # Set _use_cols. 
We don't store columns because they are overwritten. + # Set _use_cols. We don't store columns because they are + # overwritten. self._handle_usecols(columns, names) else: self._col_indices = None @@ -1538,9 +1563,9 @@ def _infer_columns(self): num_original_columns = ncols if not names: if self.prefix: - columns = [ ['X%d' % i for i in range(ncols)] ] + columns = [['X%d' % i for i in range(ncols)]] else: - columns = [ lrange(ncols) ] + columns = [lrange(ncols)] columns = self._handle_usecols(columns, columns[0]) else: if self.usecols is None or len(names) == num_original_columns: @@ -1548,8 +1573,10 @@ def _infer_columns(self): num_original_columns = len(names) else: if self.usecols and len(names) != len(self.usecols): - raise ValueError('Number of passed names did not match ' - 'number of header fields in the file') + raise ValueError( + 'Number of passed names did not match number of ' + 'header fields in the file' + ) # Ignore output but set used columns. self._handle_usecols([names], names) columns = [names] @@ -1566,7 +1593,8 @@ def _handle_usecols(self, columns, usecols_key): if self.usecols is not None: if any([isinstance(u, string_types) for u in self.usecols]): if len(columns) > 1: - raise ValueError("If using multiple headers, usecols must be integers.") + raise ValueError("If using multiple headers, usecols must " + "be integers.") col_indices = [] for u in self.usecols: if isinstance(u, string_types): @@ -1576,7 +1604,8 @@ def _handle_usecols(self, columns, usecols_key): else: col_indices = self.usecols - columns = [[n for i, n in enumerate(column) if i in col_indices] for column in columns] + columns = [[n for i, n in enumerate(column) if i in col_indices] + for column in columns] self._col_indices = col_indices return columns @@ -1640,8 +1669,9 @@ def _check_thousands(self, lines): for i, x in enumerate(l): if (not isinstance(x, compat.string_types) or self.thousands not in x or - (self._no_thousands_columns and i in self._no_thousands_columns) or - nonnum.search(x.strip())): + (self._no_thousands_columns + and i in self._no_thousands_columns) + or nonnum.search(x.strip())): rl.append(x) else: rl.append(x.replace(self.thousands, '')) @@ -1746,9 +1776,14 @@ def _rows_to_cols(self, content): if self.usecols: if self._implicit_index: - zipped_content = [a for i, a in enumerate(zipped_content) if i < len(self.index_col) or i - len(self.index_col) in self._col_indices] + zipped_content = [ + a for i, a in enumerate(zipped_content) + if (i < len(self.index_col) + or i - len(self.index_col) in self._col_indices) + ] else: - zipped_content = [a for i, a in enumerate(zipped_content) if i in self._col_indices] + zipped_content = [a for i, a in enumerate(zipped_content) + if i in self._col_indices] return zipped_content def _get_lines(self, rows=None): @@ -1802,8 +1837,8 @@ def _get_lines(self, rows=None): except csv.Error as inst: if 'newline inside string' in str(inst): row_num = str(self.pos + rows) - msg = ('EOF inside string starting with line ' - + row_num) + msg = ('EOF inside string starting with ' + 'line ' + row_num) raise Exception(msg) raise except StopIteration: @@ -1948,7 +1983,9 @@ def _clean_na_values(na_values, keep_default_na=True): for k, v in compat.iteritems(na_values): v = set(list(v)) | _NA_VALUES na_values[k] = v - na_fvalues = dict([ (k, _floatify_na_values(v)) for k, v in na_values.items() ]) + na_fvalues = dict([ + (k, _floatify_na_values(v)) for k, v in na_values.items() + ]) else: if not com.is_list_like(na_values): na_values = [na_values] @@ -1987,7 +2024,8 
@@ def _clean_index_names(columns, index_col): index_names.append(name) # hack - if isinstance(index_names[0], compat.string_types) and 'Unnamed' in index_names[0]: + if isinstance(index_names[0], compat.string_types)\ + and 'Unnamed' in index_names[0]: index_names[0] = None return index_names, columns, index_col @@ -2071,10 +2109,13 @@ def _get_col_names(colspec, columns): def _concat_date_cols(date_cols): if len(date_cols) == 1: if compat.PY3: - return np.array([compat.text_type(x) for x in date_cols[0]], dtype=object) + return np.array([compat.text_type(x) for x in date_cols[0]], + dtype=object) else: - return np.array([str(x) if not isinstance(x, compat.string_types) else x - for x in date_cols[0]], dtype=object) + return np.array([ + str(x) if not isinstance(x, compat.string_types) else x + for x in date_cols[0] + ], dtype=object) rs = np.array([' '.join([compat.text_type(y) for y in x]) for x in zip(*date_cols)], dtype=object) @@ -2101,9 +2142,9 @@ def __init__(self, f, colspecs, delimiter, comment): for colspec in self.colspecs: if not (isinstance(colspec, (tuple, list)) and - len(colspec) == 2 and - isinstance(colspec[0], (int, np.integer)) and - isinstance(colspec[1], (int, np.integer))): + len(colspec) == 2 and + isinstance(colspec[0], (int, np.integer)) and + isinstance(colspec[1], (int, np.integer))): raise TypeError('Each column specification must be ' '2 element tuple or list of integers') diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 97633873e7b40..915c1e9ae1574 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,5 +1,6 @@ from pandas.compat import cPickle as pkl, pickle_compat as pc, PY3 + def to_pickle(obj, path): """ Pickle (serialize) object to input file path @@ -19,8 +20,8 @@ def read_pickle(path): Load pickled pandas object (or any other pickled object) from the specified file path - Warning: Loading pickled data received from untrusted sources can be unsafe. - See: http://docs.python.org/2.7/library/pickle.html + Warning: Loading pickled data received from untrusted sources can be + unsafe. 
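A round-trip sketch for the pickle helpers above; the file name is illustrative, and the warning in the docstring applies: only read pickles from trusted sources.

import pandas as pd
from pandas.util.testing import assert_frame_equal

df = pd.DataFrame({'x': [1, 2, 3]})
df.to_pickle('frame.pkl')                            # wraps pandas.io.pickle.to_pickle
assert_frame_equal(pd.read_pickle('frame.pkl'), df)  # same frame comes back
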
See: http://docs.python.org/2.7/library/pickle.html Parameters ---------- @@ -38,10 +39,10 @@ def try_read(path, encoding=None): # pass encoding only if its not None as py2 doesn't handle # the param try: - with open(path,'rb') as fh: + with open(path, 'rb') as fh: return pc.load(fh, encoding=encoding, compat=False) except: - with open(path,'rb') as fh: + with open(path, 'rb') as fh: return pc.load(fh, encoding=encoding, compat=True) try: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index db2028c70dc20..6ebc33afdd43d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -74,10 +74,11 @@ def _ensure_term(where): create the terms here with a frame_level=2 (we are 2 levels down) """ - # only consider list/tuple here as an ndarray is automaticaly a coordinate list - if isinstance(where, (list,tuple)): + # only consider list/tuple here as an ndarray is automaticaly a coordinate + # list + if isinstance(where, (list, tuple)): where = [w if not maybe_expression(w) else Term(w, scope_level=2) - for w in where if w is not None ] + for w in where if w is not None] elif maybe_expression(where): where = Term(where, scope_level=2) return where @@ -124,11 +125,11 @@ class DuplicateWarning(Warning): # formats _FORMAT_MAP = { - u('f') : 'fixed', - u('fixed') : 'fixed', - u('t') : 'table', - u('table') : 'table', - } + u('f'): 'fixed', + u('fixed'): 'fixed', + u('t'): 'table', + u('table'): 'table', +} format_deprecate_doc = """ the table keyword has been deprecated @@ -169,7 +170,7 @@ class DuplicateWarning(Warning): # table class map _TABLE_MAP = { u('generic_table'): 'GenericTable', - u('appendable_series') : 'AppendableSeriesTable', + u('appendable_series'): 'AppendableSeriesTable', u('appendable_multiseries'): 'AppendableMultiSeriesTable', u('appendable_frame'): 'AppendableFrameTable', u('appendable_multiframe'): 'AppendableMultiFrameTable', @@ -202,8 +203,10 @@ class DuplicateWarning(Warning): with config.config_prefix('io.hdf'): config.register_option('dropna_table', True, dropna_doc, validator=config.is_bool) - config.register_option('default_format', None, format_doc, - validator=config.is_one_of_factory(['fixed','table',None])) + config.register_option( + 'default_format', None, format_doc, + validator=config.is_one_of_factory(['fixed', 'table', None]) + ) # oh the troubles to reduce import time _table_mod = None @@ -271,7 +274,7 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, def read_hdf(path_or_buf, key, **kwargs): - """ read from the store, closeit if we opened it + """ read from the store, close it if we opened it Retrieve pandas object stored in file, optionally based on where criteria @@ -281,12 +284,16 @@ def read_hdf(path_or_buf, key, **kwargs): path_or_buf : path (string), or buffer to read from key : group identifier in the store where : list of Term (or convertable) objects, optional - start : optional, integer (defaults to None), row number to start selection - stop : optional, integer (defaults to None), row number to stop selection - columns : optional, a list of columns that if not None, will limit the return columns + start : optional, integer (defaults to None), row number to start + selection + stop : optional, integer (defaults to None), row number to stop + selection + columns : optional, a list of columns that if not None, will limit the + return columns iterator : optional, boolean, return an iterator, default False chunksize : optional, nrows to include in iteration, return an iterator - auto_close : optional, 
boolean, should automatically close the store when finished, default is False + auto_close : optional, boolean, should automatically close the store + when finished, default is False Returns ------- @@ -442,8 +449,8 @@ def __unicode__(self): pprint_thing(s or 'invalid_HDFStore node')) except Exception as detail: keys.append(k) - values.append( - "[invalid_HDFStore node: %s]" % pprint_thing(detail)) + values.append("[invalid_HDFStore node: %s]" + % pprint_thing(detail)) output += adjoin(12, keys, values) else: @@ -456,7 +463,8 @@ def __unicode__(self): def keys(self): """ Return a (potentially unordered) list of the keys corresponding to the - objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. have the leading '/' + objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. + have the leading '/' """ return [n._v_pathname for n in self.groups()] @@ -482,15 +490,18 @@ def open(self, mode='a', **kwargs): if self._mode != mode: - # if we are chaning a write mode to read, ok + # if we are changing a write mode to read, ok if self._mode in ['a', 'w'] and mode in ['r', 'r+']: pass elif mode in ['w']: # this would truncate, raise here if self.is_open: - raise PossibleDataLossError("Re-opening the file [{0}] with mode [{1}] " - "will delete the current file!".format(self._path, self._mode)) + raise PossibleDataLossError( + "Re-opening the file [{0}] with mode [{1}] " + "will delete the current file!" + .format(self._path, self._mode) + ) self._mode = mode @@ -588,10 +599,12 @@ def select(self, key, where=None, start=None, stop=None, columns=None, where : list of Term (or convertable) objects, optional start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection - columns : a list of columns that if not None, will limit the return columns + columns : a list of columns that if not None, will limit the return + columns iterator : boolean, return an iterator, default False chunksize : nrows to include in iteration, return an iterator - auto_close : boolean, should automatically close the store when finished, default is False + auto_close : boolean, should automatically close the store when + finished, default is False Returns ------- @@ -636,16 +649,20 @@ def select_as_coordinates( stop : integer (defaults to None), row number to stop selection """ where = _ensure_term(where) - return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs) + return self.get_storer(key).read_coordinates(where=where, start=start, + stop=stop, **kwargs) def unique(self, key, column, **kwargs): warnings.warn("unique(key,column) is deprecated\n" - "use select_column(key,column).unique() instead",FutureWarning) - return self.get_storer(key).read_column(column=column, **kwargs).unique() + "use select_column(key,column).unique() instead", + FutureWarning) + return self.get_storer(key).read_column(column=column, + **kwargs).unique() def select_column(self, key, column, **kwargs): """ - return a single column from the table. This is generally only useful to select an indexable + return a single column from the table. 
This is generally only useful to + select an indexable Parameters ---------- @@ -654,8 +671,10 @@ def select_column(self, key, column, **kwargs): Exceptions ---------- - raises KeyError if the column is not found (or key is not a valid store) - raises ValueError if the column can not be extracted indivually (it is part of a data block) + raises KeyError if the column is not found (or key is not a valid + store) + raises ValueError if the column can not be extracted individually (it + is part of a data block) """ return self.get_storer(key).read_column(column=column, **kwargs) @@ -668,7 +687,8 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, Parameters ---------- keys : a list of the tables - selector : the table to apply the where criteria (defaults to keys[0] if not supplied) + selector : the table to apply the where criteria (defaults to keys[0] + if not supplied) columns : the columns I want back start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection @@ -677,7 +697,8 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, Exceptions ---------- - raise if any of the keys don't refer to tables or if they are not ALL THE SAME DIMENSIONS + raise if any of the keys don't refer to tables or if they are not ALL + THE SAME DIMENSIONS """ # default to single select @@ -708,8 +729,9 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, raise TypeError("Invalid table [%s]" % k) if not t.is_table: raise TypeError( - "object [%s] is not a table, and cannot be used in all select as multiple" % - t.pathname) + "object [%s] is not a table, and cannot be used in all " + "select as multiple" % t.pathname + ) if nrows is None: nrows = t.nrows @@ -735,12 +757,16 @@ def func(_start, _stop): axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] # concat and return - return concat(objs, axis=axis, verify_integrity=False).consolidate() + return concat(objs, axis=axis, + verify_integrity=False).consolidate() if iterator or chunksize is not None: - return TableIterator(self, func, nrows=nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) + return TableIterator(self, func, nrows=nrows, start=start, + stop=stop, chunksize=chunksize, + auto_close=auto_close) - return TableIterator(self, func, nrows=nrows, start=start, stop=stop, auto_close=auto_close).get_values() + return TableIterator(self, func, nrows=nrows, start=start, stop=stop, + auto_close=auto_close).get_values() def put(self, key, value, format=None, append=False, **kwargs): """ @@ -754,11 +780,12 @@ def put(self, key, value, format=None, append=False, **kwargs): fixed(f) : Fixed format Fast writing/reading. Not-appendable, nor searchable table(t) : Table format - Write as a PyTables Table structure which may perform worse but - allow more flexible operations like searching / selecting subsets - of the data + Write as a PyTables Table structure which may perform + worse but allow more flexible operations like searching + / selecting subsets of the data append : boolean, default False - This will force Table format, append the input data to the existing. + This will force Table format, append the input data to the + existing. 
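A short sketch of the fixed vs. table trade-off described in the put()/append() docstrings above; the store path and frame contents are illustrative only.

import pandas as pd

df = pd.DataFrame({'A': range(5), 'B': list('abcde')})
store = pd.HDFStore('store.h5', mode='w')
store.put('df_fixed', df, format='fixed')   # fast, but not appendable or searchable
store.put('df_table', df, format='table')   # PyTables Table: slower, queryable
store.append('df_table', df)                # append requires the table format
store.close()
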
encoding : default None, provide an encoding for strings """ if format is None: @@ -816,7 +843,8 @@ def remove(self, key, where=None, start=None, stop=None): 'can only remove with where on objects written as tables') return s.delete(where=where, start=start, stop=stop) - def append(self, key, value, format=None, append=True, columns=None, dropna=None, **kwargs): + def append(self, key, value, format=None, append=True, columns=None, + dropna=None, **kwargs): """ Append to Table in file. Node must already exist and be Table format. @@ -827,18 +855,20 @@ def append(self, key, value, format=None, append=True, columns=None, dropna=None value : {Series, DataFrame, Panel, Panel4D} format: 'table' is the default table(t) : table format - Write as a PyTables Table structure which may perform worse but - allow more flexible operations like searching / selecting subsets - of the data - append : boolean, default True, append the input data to the existing - data_columns : list of columns to create as data columns, or True to use all columns + Write as a PyTables Table structure which may perform + worse but allow more flexible operations like searching + / selecting subsets of the data + append : boolean, default True, append the input data to the + existing + data_columns : list of columns to create as data columns, or True to + use all columns min_itemsize : dict of columns that specify minimum string sizes nan_rep : string to use as string nan represenation chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table encoding : default None, provide an encoding for strings - dropna : boolean, default True, do not write an ALL nan row to the store - settable by the option 'io.hdf.dropna_table' + dropna : boolean, default True, do not write an ALL nan row to + the store settable by the option 'io.hdf.dropna_table' Notes ----- Does *not* check if data being appended overlaps with existing @@ -853,21 +883,24 @@ def append(self, key, value, format=None, append=True, columns=None, dropna=None if format is None: format = get_option("io.hdf.default_format") or 'table' kwargs = self._validate_format(format, kwargs) - self._write_to_group(key, value, append=append, dropna=dropna, **kwargs) + self._write_to_group(key, value, append=append, dropna=dropna, + **kwargs) - def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, dropna=True, **kwargs): + def append_to_multiple(self, d, value, selector, data_columns=None, + axes=None, dropna=True, **kwargs): """ Append to multiple tables Parameters ---------- - d : a dict of table_name to table_columns, None is acceptable as the values of - one node (this will get all the remaining columns) + d : a dict of table_name to table_columns, None is acceptable as the + values of one node (this will get all the remaining columns) value : a pandas object - selector : a string that designates the indexable table; all of its columns will - be designed as data_columns, unless data_columns is passed, in which - case these are used - data_columns : list of columns to create as data columns, or True to use all columns + selector : a string that designates the indexable table; all of its + columns will be designed as data_columns, unless data_columns is + passed, in which case these are used + data_columns : list of columns to create as data columns, or True to + use all columns dropna : if evaluates to True, drop rows from all tables if any single row in each table has all NaN @@ -879,15 +912,18 @@ def append_to_multiple(self, 
d, value, selector, data_columns=None, axes=None, d if axes is not None: raise TypeError("axes is currently not accepted as a parameter to" " append_to_multiple; you can create the " - "tables indepdently instead") + "tables independently instead") if not isinstance(d, dict): raise ValueError( - "append_to_multiple must have a dictionary specified as the way to split the value") + "append_to_multiple must have a dictionary specified as the " + "way to split the value" + ) if selector not in d: raise ValueError( - "append_to_multiple requires a selector that is in passed dict") + "append_to_multiple requires a selector that is in passed dict" + ) # figure out the splitting axis (the non_index_axis) axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] @@ -899,7 +935,9 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, d if v is None: if remain_key is not None: raise ValueError( - "append_to_multiple can only have one value in d that is None") + "append_to_multiple can only have one value in d that " + "is None" + ) remain_key = k else: remain_values.extend(v) @@ -952,15 +990,23 @@ def create_table_index(self, key, **kwargs): return if not s.is_table: - raise TypeError("cannot create table index on a Fixed format store") + raise TypeError( + "cannot create table index on a Fixed format store") s.create_index(**kwargs) def groups(self): - """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ + """return a list of all the top-level nodes (that are not themselves a + pandas storage object) + """ _tables() self._check_if_open() - return [g for g in self._handle.walkNodes() if getattr(g._v_attrs, 'pandas_type', None) or getattr( - g, 'table', None) or (isinstance(g, _table_mod.table.Table) and g._v_name != u('table'))] + return [ + g for g in self._handle.walkNodes() + if (getattr(g._v_attrs, 'pandas_type', None) or + getattr(g, 'table', None) or + (isinstance(g, _table_mod.table.Table) and + g._v_name != u('table'))) + ] def get_node(self, key): """ return the node with the key or None if it does not exist """ @@ -981,16 +1027,16 @@ def get_storer(self, key): s.infer_axes() return s - def copy( - self, file, mode='w', propindexes=True, keys=None, complib = None, complevel = None, - fletcher32=False, overwrite=True): + def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, + complevel=None, fletcher32=False, overwrite=True): """ copy the existing store to a new file, upgrading in place Parameters ---------- propindexes: restore indexes in copied file (defaults to True) keys : list of keys to include in the copy (defaults to all) - overwrite : overwrite (remove and replace) existing nodes in the new store (default is True) + overwrite : overwrite (remove and replace) existing nodes in the + new store (default is True) mode, complib, complevel, fletcher32 same as in HDFStore.__init__ Returns @@ -1022,8 +1068,11 @@ def copy( index = False if propindexes: index = [a.name for a in s.axes if a.is_indexed] - new_store.append(k, data, index=index, data_columns=getattr( - s, 'data_columns', None), encoding=s.encoding) + new_store.append( + k, data, index=index, + data_columns=getattr(s, 'data_columns', None), + encoding=s.encoding + ) else: new_store.put(k, data, encoding=s.encoding) @@ -1039,10 +1088,10 @@ def _validate_format(self, format, kwargs): kwargs = kwargs.copy() # table arg - table = kwargs.pop('table',None) + table = kwargs.pop('table', None) if table is not None: - 
warnings.warn(format_deprecate_doc,FutureWarning) + warnings.warn(format_deprecate_doc, FutureWarning) if table: format = 'table' @@ -1053,17 +1102,21 @@ def _validate_format(self, format, kwargs): try: kwargs['format'] = _FORMAT_MAP[format.lower()] except: - raise TypeError("invalid HDFStore format specified [{0}]".format(format)) + raise TypeError("invalid HDFStore format specified [{0}]" + .format(format)) return kwargs - def _create_storer(self, group, format=None, value=None, append=False, **kwargs): + def _create_storer(self, group, format=None, value=None, append=False, + **kwargs): """ return a suitable class to operate """ def error(t): raise TypeError( - "cannot properly create the storer for: [%s] [group->%s,value->%s,format->%s,append->%s,kwargs->%s]" % - (t, group, type(value), format, append, kwargs)) + "cannot properly create the storer for: [%s] [group->%s," + "value->%s,format->%s,append->%s,kwargs->%s]" + % (t, group, type(value), format, append, kwargs) + ) pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None)) tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None)) @@ -1073,12 +1126,14 @@ def error(t): if value is None: _tables() - if getattr(group, 'table', None) or isinstance(group, _table_mod.table.Table): + if (getattr(group, 'table', None) or + isinstance(group, _table_mod.table.Table)): pt = u('frame_table') tt = u('generic_table') else: raise TypeError( - "cannot create a storer if the object is not existing nor a value are passed") + "cannot create a storer if the object is not existing " + "nor a value are passed") else: try: @@ -1104,14 +1159,14 @@ def error(t): if value is not None: if pt == u('series_table'): - index = getattr(value,'index',None) + index = getattr(value, 'index', None) if index is not None: if index.nlevels == 1: tt = u('appendable_series') elif index.nlevels > 1: tt = u('appendable_multiseries') elif pt == u('frame_table'): - index = getattr(value,'index',None) + index = getattr(value, 'index', None) if index is not None: if index.nlevels == 1: tt = u('appendable_frame') @@ -1138,8 +1193,7 @@ def error(t): except: error('_TABLE_MAP') - def _write_to_group( - self, key, value, format, index=True, append=False, + def _write_to_group(self, key, value, format, index=True, append=False, complib=None, encoding=None, **kwargs): group = self.get_node(key) @@ -1150,7 +1204,7 @@ def _write_to_group( # we don't want to store a table node at all if are object is 0-len # as there are not dtypes - if getattr(value,'empty',None) and (format == 'table' or append): + if getattr(value, 'empty', None) and (format == 'table' or append): return if group is None: @@ -1175,7 +1229,8 @@ def _write_to_group( if append: # raise if we are trying to append to a Fixed format, # or a table that exists (and we are putting) - if not s.is_table or (s.is_table and format == 'fixed' and s.is_exists): + if (not s.is_table or + (s.is_table and format == 'fixed' and s.is_exists)): raise ValueError('Can only append to Tables') if not s.is_exists: s.set_object_info() @@ -1183,7 +1238,9 @@ def _write_to_group( s.set_object_info() if not s.is_table and complib: - raise ValueError('Compression not supported on Fixed format stores') + raise ValueError( + 'Compression not supported on Fixed format stores' + ) # write the object s.write(obj=value, append=append, complib=complib, **kwargs) @@ -1210,8 +1267,8 @@ class TableIterator(object): start : the passed start value (default is None) stop : the passed stop value (default is None) chunksize : the passed chunking 
valeu (default is 50000) - auto_close : boolean, automatically close the store at the end of iteration, - default is False + auto_close : boolean, automatically close the store at the end of + iteration, default is False kwargs : the passed kwargs """ @@ -1274,10 +1331,9 @@ class IndexCol(StringMixin): is_data_indexable = True _info_fields = ['freq', 'tz', 'index_name'] - def __init__( - self, values=None, kind=None, typ=None, cname=None, itemsize=None, - name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None, - index_name=None, **kwargs): + def __init__(self, values=None, kind=None, typ=None, cname=None, + itemsize=None, name=None, axis=None, kind_attr=None, pos=None, + freq=None, tz=None, index_name=None, **kwargs): self.values = values self.kind = kind self.typ = typ @@ -1335,7 +1391,8 @@ def __unicode__(self): def __eq__(self, other): """ compare 2 col items """ - return all([getattr(self, a, None) == getattr(other, a, None) for a in ['name', 'cname', 'axis', 'pos']]) + return all([getattr(self, a, None) == getattr(other, a, None) + for a in ['name', 'cname', 'axis', 'pos']]) def __ne__(self, other): return not self.__eq__(other) @@ -1353,7 +1410,7 @@ def copy(self): return new_self def infer(self, table): - """ infer this column from the table: create and return a new object """ + """infer this column from the table: create and return a new object""" new_self = self.copy() new_self.set_table(table) new_self.get_attr() @@ -1420,7 +1477,8 @@ def __iter__(self): def maybe_set_size(self, min_itemsize=None, **kwargs): """ maybe set a string col itemsize: - min_itemsize can be an interger or a dict with this columns name with an integer size """ + min_itemsize can be an interger or a dict with this columns name + with an integer size """ if _ensure_decoded(self.kind) == u('string'): if isinstance(min_itemsize, dict): @@ -1446,10 +1504,11 @@ def validate_col(self, itemsize=None): if itemsize is None: itemsize = self.itemsize if c.itemsize < itemsize: - raise ValueError("Trying to store a string with len [%s] in [%s] column but\n" - "this column has a limit of [%s]!\n" - "Consider using min_itemsize to preset the sizes on these columns" - % (itemsize, self.cname, c.itemsize)) + raise ValueError( + "Trying to store a string with len [%s] in [%s] " + "column but\nthis column has a limit of [%s]!\n" + "Consider using min_itemsize to preset the sizes on " + "these columns" % (itemsize, self.cname, c.itemsize)) return c.itemsize return None @@ -1484,9 +1543,10 @@ def update_info(self, info): setattr(self, key, None) else: - raise ValueError("invalid info for [%s] for [%s]""" - ", existing_value [%s] conflicts with new value [%s]" % (self.name, - key, existing_value, value)) + raise ValueError( + "invalid info for [%s] for [%s], existing_value [%s] " + "conflicts with new value [%s]" + % (self.name, key, existing_value, value)) else: if value is not None or existing_value is not None: idx[key] = value @@ -1537,7 +1597,8 @@ class DataCol(IndexCol): ---------- data : the actual data - cname : the column name in the table to hold the data (typeically values) + cname : the column name in the table to hold the data (typically + values) """ is_an_indexable = False is_data_indexable = False @@ -1574,11 +1635,14 @@ def __init__(self, values=None, kind=None, typ=None, self.set_data(data) def __unicode__(self): - return "name->%s,cname->%s,dtype->%s,shape->%s" % (self.name, self.cname, self.dtype, self.shape) + return "name->%s,cname->%s,dtype->%s,shape->%s" % ( + self.name, self.cname, 
self.dtype, self.shape + ) def __eq__(self, other): """ compare 2 col items """ - return all([getattr(self, a, None) == getattr(other, a, None) for a in ['name', 'cname', 'dtype', 'pos']]) + return all([getattr(self, a, None) == getattr(other, a, None) + for a in ['name', 'cname', 'dtype', 'pos']]) def set_data(self, data, dtype=None): self.data = data @@ -1644,7 +1708,9 @@ def set_atom(self, block, existing_col, min_itemsize, # if this block has more than one timezone, raise if len(set([r.tzinfo for r in rvalues])) != 1: raise TypeError( - "too many timezones in this block, create separate data columns") + "too many timezones in this block, create separate " + "data columns" + ) # convert this column to datetime64[ns] utc, and save the tz index = DatetimeIndex(rvalues) @@ -1707,9 +1773,11 @@ def set_atom_string( col = block.get(item) inferred_type = lib.infer_dtype(col.ravel()) if inferred_type != 'string': - raise TypeError("Cannot serialize the column [%s] because\n" - "its data contents are [%s] object dtype" % - (item, inferred_type)) + raise TypeError( + "Cannot serialize the column [%s] because\n" + "its data contents are [%s] object dtype" + % (item, inferred_type) + ) # itemsize is the maximum length of a string (along any dimension) itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) @@ -1781,7 +1849,7 @@ def cvalues(self): return self.data def validate_attr(self, append): - """ validate that we have the same order as the existing & same dtype """ + """validate that we have the same order as the existing & same dtype""" if append: existing_fields = getattr(self.attrs, self.kind_attr, None) if (existing_fields is not None and @@ -1792,11 +1860,13 @@ def validate_attr(self, append): existing_dtype = getattr(self.attrs, self.dtype_attr, None) if (existing_dtype is not None and existing_dtype != self.dtype): - raise ValueError("appended items dtype do not match existing items dtype" - " in table!") + raise ValueError("appended items dtype do not match existing " + "items dtype in table!") def convert(self, values, nan_rep, encoding): - """ set the data from this selection (and convert to the correct dtype if we can) """ + """set the data from this selection (and convert to the correct dtype + if we can) + """ try: values = values[self.cname] except: @@ -1829,9 +1899,10 @@ def convert(self, values, nan_rep, encoding): try: self.data = np.array( [date.fromordinal(v) for v in self.data], dtype=object) - except (ValueError): + except ValueError: self.data = np.array( - [date.fromtimestamp(v) for v in self.data], dtype=object) + [date.fromtimestamp(v) for v in self.data], + dtype=object) elif dtype == u('datetime'): self.data = np.array( [datetime.fromtimestamp(v) for v in self.data], @@ -1914,7 +1985,8 @@ def __init__(self, parent, group, encoding=None, **kwargs): @property def is_old_version(self): - return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 + return (self.version[0] <= 0 and self.version[1] <= 10 and + self.version[2] < 1) def set_version(self): """ compute and set our version """ @@ -1929,7 +2001,8 @@ def set_version(self): @property def pandas_type(self): - return _ensure_decoded(getattr(self.group._v_attrs, 'pandas_type', None)) + return _ensure_decoded(getattr(self.group._v_attrs, + 'pandas_type', None)) @property def format_type(self): @@ -2041,7 +2114,9 @@ def write(self, **kwargs): "cannot write on an abstract storer: sublcasses should implement") def delete(self, where=None, **kwargs): - """ support fully deleting the node 
in its entirety (only) - where specification must be None """ + """support fully deleting the node in its entirety (only) - where + specification must be None + """ if where is None: self._handle.removeNode(self.group, recursive=True) return None @@ -2052,8 +2127,7 @@ def delete(self, where=None, **kwargs): class GenericFixed(Fixed): """ a generified fixed version """ - _index_type_map = {DatetimeIndex: 'datetime', - PeriodIndex: 'period'} + _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'} _reverse_index_map = dict([(v, k) for k, v in compat.iteritems(_index_type_map)]) attributes = [] @@ -2078,11 +2152,13 @@ def f(values, freq=None, tz=None): def validate_read(self, kwargs): if kwargs.get('columns') is not None: - raise TypeError("cannot pass a column specification when reading a Fixed format store." - "this store must be selected in its entirety") + raise TypeError("cannot pass a column specification when reading " + "a Fixed format store. this store must be " + "selected in its entirety") if kwargs.get('where') is not None: - raise TypeError("cannot pass a where specification when reading from a Fixed format store." - "this store must be selected in its entirety") + raise TypeError("cannot pass a where specification when reading " + "from a Fixed format store. this store must be " + "selected in its entirety") @property def is_exists(self): @@ -2246,9 +2322,10 @@ def read_index_node(self, node): data = node[:] # If the index was an empty array write_array_empty() will # have written a sentinel. Here we relace it with the original. - if 'shape' in node._v_attrs \ - and self._is_empty_array(getattr(node._v_attrs, 'shape')): - data = np.empty(getattr(node._v_attrs, 'shape'), dtype=getattr(node._v_attrs, 'value_type')) + if ('shape' in node._v_attrs and + self._is_empty_array(getattr(node._v_attrs, 'shape'))): + data = np.empty(getattr(node._v_attrs, 'shape'), + dtype=getattr(node._v_attrs, 'value_type')) kind = _ensure_decoded(node._v_attrs.kind) name = None @@ -2268,8 +2345,8 @@ def read_index_node(self, node): if kind in (u('date'), u('datetime')): index = factory( - _unconvert_index(data, kind, encoding=self.encoding), dtype=object, - **kwargs) + _unconvert_index(data, kind, encoding=self.encoding), + dtype=object, **kwargs) else: index = factory( _unconvert_index(data, kind, encoding=self.encoding), **kwargs) @@ -2351,10 +2428,12 @@ def write_array(self, key, value, items=None): else: if value.dtype.type == np.datetime64: self._handle.createArray(self.group, key, value.view('i8')) - getattr(self.group, key)._v_attrs.value_type = 'datetime64' + getattr( + self.group, key)._v_attrs.value_type = 'datetime64' elif value.dtype.type == np.timedelta64: self._handle.createArray(self.group, key, value.view('i8')) - getattr(self.group, key)._v_attrs.value_type = 'timedelta64' + getattr( + self.group, key)._v_attrs.value_type = 'timedelta64' else: self._handle.createArray(self.group, key, value) @@ -2423,7 +2502,8 @@ def read(self, **kwargs): sp_values = self.read_array('sp_values') sp_index = self.read_index('sp_index') return SparseSeries(sp_values, index=index, sparse_index=sp_index, - kind=self.kind or u('block'), fill_value=self.fill_value, + kind=self.kind or u('block'), + fill_value=self.fill_value, name=self.name) def write(self, obj, **kwargs): @@ -2596,14 +2676,20 @@ class Table(Fixed): Attrs in Table Node ------------------- - These are attributes that are store in the main table node, they are necessary - to recreate these tables when read back in. 
- - index_axes : a list of tuples of the (original indexing axis and index column) - non_index_axes: a list of tuples of the (original index axis and columns on a non-indexing axis) - values_axes : a list of the columns which comprise the data of this table - data_columns : a list of the columns that we are allowing indexing (these become single columns in values_axes), or True to force all columns - nan_rep : the string to use for nan representations for string objects + These are attributes that are store in the main table node, they are + necessary to recreate these tables when read back in. + + index_axes : a list of tuples of the (original indexing axis and + index column) + non_index_axes: a list of tuples of the (original index axis and + columns on a non-indexing axis) + values_axes : a list of the columns which comprise the data of this + table + data_columns : a list of the columns that we are allowing indexing + (these become single columns in values_axes), or True to force all + columns + nan_rep : the string to use for nan representations for string + objects levels : the names of levels """ @@ -2641,14 +2727,10 @@ def __unicode__(self): if self.is_old_version: ver = "[%s]" % '.'.join([str(x) for x in self.version]) - return "%-12.12s%s (typ->%s,nrows->%s,ncols->%s,indexers->[%s]%s)" % (self.pandas_type, - ver, - self.table_type_short, - self.nrows, - self.ncols, - ','.join( - [a.name for a in self.index_axes]), - dc) + return "%-12.12s%s (typ->%s,nrows->%s,ncols->%s,indexers->[%s]%s)" % ( + self.pandas_type, ver, self.table_type_short, self.nrows, + self.ncols, ','.join([a.name for a in self.index_axes]), dc + ) def __getitem__(self, c): """ return the axis for c """ @@ -2676,25 +2758,30 @@ def validate(self, other): oax = ov[i] if sax != oax: raise ValueError( - "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % - (c, sax, oax)) + "invalid combinate of [%s] on appending data [%s] " + "vs current table [%s]" % (c, sax, oax)) # should never get here raise Exception( - "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c, sv, ov)) + "invalid combinate of [%s] on appending data [%s] vs " + "current table [%s]" % (c, sv, ov)) @property def is_multi_index(self): - """ the levels attribute is 1 or a list in the case of a multi-index """ - return isinstance(self.levels,list) + """the levels attribute is 1 or a list in the case of a multi-index""" + return isinstance(self.levels, list) def validate_multiindex(self, obj): - """ validate that we can store the multi-index; reset and return the new object """ - levels = [ l if l is not None else "level_{0}".format(i) for i, l in enumerate(obj.index.names) ] + """validate that we can store the multi-index; reset and return the + new object + """ + levels = [l if l is not None else "level_{0}".format(i) + for i, l in enumerate(obj.index.names)] try: return obj.reset_index(), levels - except (ValueError): - raise ValueError("duplicate names/columns in the multi-index when storing as a table") + except ValueError: + raise ValueError("duplicate names/columns in the multi-index when " + "storing as a table") @property def nrows_expected(self): @@ -2738,17 +2825,21 @@ def is_transposed(self): @property def data_orientation(self): - """ return a tuple of my permutated axes, non_indexable at the front """ - return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes], [int(a.axis) for a in self.index_axes])) + """return a tuple of my permutated axes, non_indexable at the front""" + return 
tuple(itertools.chain([int(a[0]) for a in self.non_index_axes], + [int(a.axis) for a in self.index_axes])) def queryables(self): """ return a dict of the kinds allowable columns for this object """ # compute the values_axes queryables - return dict([(a.cname, a.kind) for a in self.index_axes] + - [(self.storage_obj_type._AXIS_NAMES[axis], None) for axis, values in self.non_index_axes] + - [(v.cname, v.kind) for v in self.values_axes if v.name in set(self.data_columns)] - ) + return dict( + [(a.cname, a.kind) for a in self.index_axes] + + [(self.storage_obj_type._AXIS_NAMES[axis], None) + for axis, values in self.non_index_axes] + + [(v.cname, v.kind) for v in self.values_axes + if v.name in set(self.data_columns)] + ) def index_cols(self): """ return a list of my index cols """ @@ -2788,22 +2879,26 @@ def get_attrs(self): self.levels = getattr( self.attrs, 'levels', None) or [] t = self.table - self.index_axes = [a.infer(t) - for a in self.indexables if a.is_an_indexable] - self.values_axes = [a.infer(t) - for a in self.indexables if not a.is_an_indexable] + self.index_axes = [ + a.infer(t) for a in self.indexables if a.is_an_indexable + ] + self.values_axes = [ + a.infer(t) for a in self.indexables if not a.is_an_indexable + ] def validate_version(self, where=None): """ are we trying to operate on an old version? """ if where is not None: - if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: + if (self.version[0] <= 0 and self.version[1] <= 10 and + self.version[2] < 1): ws = incompatibility_doc % '.'.join( [str(x) for x in self.version]) warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): - """ validate the min_itemisze doesn't contain items that are not in the axes - this needs data_columns to be defined """ + """validate the min_itemisze doesn't contain items that are not in the + axes this needs data_columns to be defined + """ if min_itemsize is None: return if not isinstance(min_itemsize, dict): @@ -2817,8 +2912,8 @@ def validate_min_itemsize(self, min_itemsize): continue if k not in q: raise ValueError( - "min_itemsize has the key [%s] which is not an axis or data_column" % - k) + "min_itemsize has the key [%s] which is not an axis or " + "data_column" % k) @property def indexables(self): @@ -2828,8 +2923,10 @@ def indexables(self): self._indexables = [] # index columns - self._indexables.extend([IndexCol(name=name, axis=axis, pos=i) - for i, (axis, name) in enumerate(self.attrs.index_cols)]) + self._indexables.extend([ + IndexCol(name=name, axis=axis, pos=i) + for i, (axis, name) in enumerate(self.attrs.index_cols) + ]) # values columns dc = set(self.data_columns) @@ -2839,7 +2936,8 @@ def f(i, c): klass = DataCol if c in dc: klass = DataIndexableCol - return klass.create_for_block(i=i, name=c, pos=base_pos + i, version=self.version) + return klass.create_for_block(i=i, name=c, pos=base_pos + i, + version=self.version) self._indexables.extend( [f(i, c) for i, c in enumerate(self.attrs.values_cols)]) @@ -2854,7 +2952,8 @@ def create_index(self, columns=None, optlevel=None, kind=None): Paramaters ---------- - columns : False (don't create an index), True (create all columns index), None or list_like (the indexers to index) + columns : False (don't create an index), True (create all columns + index), None or list_like (the indexers to index) optlevel: optimization level (defaults to 6) kind : kind of index (defaults to 'medium') @@ -2907,7 +3006,9 @@ def create_index(self, columns=None, optlevel=None, kind=None): 
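Illustrative use of data_columns, min_itemsize and create_table_index, which the Table machinery above supports; the file, frame and column names are made up for the example.

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': np.random.randn(6), 'B': list('xy') * 3})
store = pd.HDFStore('example.h5', mode='w')
store.append('df', df, data_columns=['B'], min_itemsize={'B': 10})
store.create_table_index('df', columns=['B'], optlevel=6, kind='medium')
subset = store.select('df', where="B == 'x'")   # 'B' is queryable as a data column
store.close()
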
v.createIndex(**kw) def read_axes(self, where, **kwargs): - """ create and return the axes sniffed from the table: return boolean for success """ + """create and return the axes sniffed from the table: return boolean + for success + """ # validate the version self.validate_version(where) @@ -2932,15 +3033,18 @@ def get_object(self, obj): return obj def validate_data_columns(self, data_columns, min_itemsize): - """ take the input data_columns and min_itemize and create a data_columns spec """ + """take the input data_columns and min_itemize and create a data + columns spec + """ if not len(self.non_index_axes): return [] axis, axis_labels = self.non_index_axes[0] - info = self.info.get(axis,dict()) + info = self.info.get(axis, dict()) if info.get('type') == 'MultiIndex' and data_columns is not None: - raise ValueError("cannot use a multi-index on axis [{0}] with data_columns".format(axis)) + raise ValueError("cannot use a multi-index on axis [{0}] with " + "data_columns".format(axis)) # evaluate the passed data_columns, True == use all columns # take only valide axis labels @@ -2953,8 +3057,10 @@ def validate_data_columns(self, data_columns, min_itemsize): if isinstance(min_itemsize, dict): existing_data_columns = set(data_columns) - data_columns.extend( - [k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns]) + data_columns.extend([ + k for k in min_itemsize.keys() + if k != 'values' and k not in existing_data_columns + ]) # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] @@ -2962,17 +3068,21 @@ def validate_data_columns(self, data_columns, min_itemsize): def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes - leagcy tables create an indexable column, indexable index, non-indexable fields + leagcy tables create an indexable column, indexable index, + non-indexable fields Parameters: ----------- - axes: a list of the axes in order to create (names or numbers of the axes) + axes: a list of the axes in order to create (names or numbers of + the axes) obj : the object to create axes on - validate: validate the obj against an existiing object already written + validate: validate the obj against an existing object already + written min_itemsize: a dict of the min size for a column in bytes nan_rep : a values to use for string column nan_rep encoding : the encoding for string values - data_columns : a list of columns that we want to create separate to allow indexing (or True will force all colummns) + data_columns : a list of columns that we want to create separate to + allow indexing (or True will force all columns) """ @@ -2981,8 +3091,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, try: axes = _AXES_MAP[type(obj)] except: - raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" % - (self.group._v_name, type(obj))) + raise TypeError("cannot properly create the storer for: " + "[group->%s,value->%s]" + % (self.group._v_name, type(obj))) # map axes to numbers axes = [obj._get_axis_number(a) for a in axes] @@ -3021,7 +3132,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, if i in axes: name = obj._AXIS_NAMES[i] index_axes_map[i] = _convert_index( - a, self.encoding, self.format_type).set_name(name).set_axis(i) + a, self.encoding, self.format_type + ).set_name(name).set_axis(i) else: # we might be able to change the axes on the appending data if @@ -3037,16 +3149,17 @@ def 
create_axes(self, axes, obj, validate=True, nan_rep=None, append_axis = exist_axis # the non_index_axes info - info = _get_info(self.info,i) + info = _get_info(self.info, i) info['names'] = list(a.names) info['type'] = a.__class__.__name__ self.non_index_axes.append((i, append_axis)) # set axis positions (based on the axes) - self.index_axes = [index_axes_map[a].set_pos( - j).update_info(self.info) for j, - a in enumerate(axes)] + self.index_axes = [ + index_axes_map[a].set_pos(j).update_info(self.info) + for j, a in enumerate(axes) + ] j = len(self.index_axes) # check for column conflicts @@ -3066,11 +3179,13 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns = self.validate_data_columns( data_columns, min_itemsize) if len(data_columns): - blocks = block_obj.reindex_axis(Index(axis_labels) - Index( - data_columns), axis=axis)._data.blocks + blocks = block_obj.reindex_axis( + Index(axis_labels) - Index(data_columns), + axis=axis + )._data.blocks for c in data_columns: - blocks.extend(block_obj.reindex_axis( - [c], axis=axis)._data.blocks) + blocks.extend( + block_obj.reindex_axis([c], axis=axis)._data.blocks) # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: @@ -3097,7 +3212,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, name = None # we have a data_column - if data_columns and len(b.items) == 1 and b.items[0] in data_columns: + if (data_columns and len(b.items) == 1 and + b.items[0] in data_columns): klass = DataIndexableCol name = b.items[0] self.data_columns.append(name) @@ -3108,8 +3224,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, try: existing_col = existing_table.values_axes[i] except: - raise ValueError("Incompatible appended table [%s] with existing table [%s]" % - (blocks, existing_table.values_axes)) + raise ValueError("Incompatible appended table [%s] with " + "existing table [%s]" + % (blocks, existing_table.values_axes)) else: existing_col = None @@ -3128,9 +3245,12 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, self.values_axes.append(col) except (NotImplementedError, ValueError, TypeError) as e: raise e - except (Exception) as detail: - raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % ( - b.dtype.name, b.items, str(detail))) + except Exception as detail: + raise Exception( + "cannot find the correct atom type -> " + "[dtype->%s,items->%s] %s" + % (b.dtype.name, b.items, str(detail)) + ) j += 1 # validate our min_itemsize @@ -3160,7 +3280,8 @@ def process_filter(field, filt): # see if the field is the name of an axis if field == axis_name: takers = op(axis_values, filt) - return obj.ix._getitem_axis(takers, axis=axis_number) + return obj.ix._getitem_axis(takers, + axis=axis_number) # this might be the name of a file IN an axis elif field in axis_values: @@ -3173,7 +3294,8 @@ def process_filter(field, filt): if isinstance(obj, DataFrame): axis_number = 1 - axis_number takers = op(values, filt) - return obj.ix._getitem_axis(takers, axis=axis_number) + return obj.ix._getitem_axis(takers, + axis=axis_number) raise ValueError( "cannot find the field [%s] for filtering!" 
% field) @@ -3182,8 +3304,8 @@ def process_filter(field, filt): return obj - def create_description( - self, complib=None, complevel=None, fletcher32=False, expectedrows=None): + def create_description(self, complib=None, complevel=None, + fletcher32=False, expectedrows=None): """ create the description of the table from the axes & values """ # expected rows estimate @@ -3197,9 +3319,9 @@ def create_description( if complib: if complevel is None: complevel = self._complevel or 9 - filters = _tables().Filters(complevel=complevel, - complib=complib, - fletcher32=fletcher32 or self._fletcher32) + filters = _tables().Filters( + complevel=complevel, complib=complib, + fletcher32=fletcher32 or self._fletcher32) d['filters'] = filters elif self._filters is not None: d['filters'] = self._filters @@ -3207,7 +3329,9 @@ def create_description( return d def read_coordinates(self, where=None, start=None, stop=None, **kwargs): - """ select coordinates (row numbers) from a table; return the coordinates object """ + """select coordinates (row numbers) from a table; return the + coordinates object + """ # validate the version self.validate_version(where) @@ -3222,7 +3346,9 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return Index(self.selection.select_coords()) def read_column(self, column, where=None, **kwargs): - """ return a single column from the table, generally only indexables are interesting """ + """return a single column from the table, generally only indexables + are interesting + """ # validate the version self.validate_version() @@ -3241,13 +3367,14 @@ def read_column(self, column, where=None, **kwargs): if not a.is_data_indexable: raise ValueError( - "column [%s] can not be extracted individually; it is not data indexable" % - column) + "column [%s] can not be extracted individually; it is " + "not data indexable" % column) # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - return Series(a.convert(c[:], nan_rep=self.nan_rep, encoding=self.encoding).take_data()) + return Series(a.convert(c[:], nan_rep=self.nan_rep, + encoding=self.encoding).take_data()) raise KeyError("column [%s] not found in the table" % column) @@ -3268,7 +3395,8 @@ def read(self, **kwargs): def write(self, **kwargs): """ write in a format that we can search later on (but cannot append to): write out the indicies and the values using _write_array - (e.g. a CArray) create an indexing table so that we can search""" + (e.g. 
a CArray) create an indexing table so that we can search + """ raise NotImplementedError("WORKTable needs to implement write") @@ -3279,11 +3407,12 @@ class LegacyTable(Table): append (but doesn't require them), and stores the data in a format that can be easily searched - """ - _indexables = [IndexCol(name='index', axis=1, pos=0), - IndexCol(name='column', axis=2, - pos=1, index_kind='columns_kind'), - DataCol(name='fields', cname='values', kind_attr='fields', pos=2)] + """ + _indexables = [ + IndexCol(name='index', axis=1, pos=0), + IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'), + DataCol(name='fields', cname='values', kind_attr='fields', pos=2) + ] table_type = u('legacy') ndim = 3 @@ -3291,7 +3420,9 @@ def write(self, **kwargs): raise TypeError("write operations are not allowed on legacy tables!") def read(self, where=None, columns=None, **kwargs): - """ we have n indexable columns, with an arbitrary number of data axes """ + """we have n indexable columns, with an arbitrary number of data + axes + """ if not self.read_axes(where=where, **kwargs): return None @@ -3395,8 +3526,8 @@ class AppendableTable(LegacyTable): table_type = u('appendable') def write(self, obj, axes=None, append=False, complib=None, - complevel=None, fletcher32=None, min_itemsize=None, chunksize=None, - expectedrows=None, dropna=True, **kwargs): + complevel=None, fletcher32=None, min_itemsize=None, + chunksize=None, expectedrows=None, dropna=True, **kwargs): if not append and self.is_exists: self._handle.removeNode(self.group, 'table') @@ -3485,7 +3616,7 @@ def write_data(self, chunksize, dropna=True): # reshape the values if needed values = [a.take_data() for a in self.values_axes] values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) - for v in values] + for v in values] bvalues = [] for i, v in enumerate(values): new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape @@ -3617,7 +3748,8 @@ def read(self, where=None, columns=None, **kwargs): if not self.read_axes(where=where, **kwargs): return None - info = self.info.get(self.non_index_axes[0][0],dict()) if len(self.non_index_axes) else dict() + info = (self.info.get(self.non_index_axes[0][0], dict()) + if len(self.non_index_axes) else dict()) index = self.index_axes[0].values frames = [] for a in self.values_axes: @@ -3630,7 +3762,7 @@ def read(self, where=None, columns=None, **kwargs): cols = Index(a.values) names = info.get('names') if names is not None: - cols.set_names(names,inplace=True) + cols.set_names(names, inplace=True) if self.is_transposed: values = a.cvalues @@ -3679,9 +3811,10 @@ def write(self, obj, data_columns=None, **kwargs): """ we are going to write this as a frame table """ if not isinstance(obj, DataFrame): name = obj.name or 'values' - obj = DataFrame({ name : obj }, index=obj.index) + obj = DataFrame({name: obj}, index=obj.index) obj.columns = [name] - return super(AppendableSeriesTable, self).write(obj=obj, data_columns=obj.columns, **kwargs) + return super(AppendableSeriesTable, self).write( + obj=obj, data_columns=obj.columns, **kwargs) def read(self, columns=None, **kwargs): @@ -3694,13 +3827,14 @@ def read(self, columns=None, **kwargs): if is_multi_index: s.set_index(self.levels, inplace=True) - s = s.iloc[:,0] + s = s.iloc[:, 0] # remove the default name if s.name == 'values': s.name = None return s + class AppendableMultiSeriesTable(AppendableSeriesTable): """ support the new appendable table formats """ pandas_kind = u('series_table') @@ -3715,8 +3849,8 @@ def write(self, obj, **kwargs): 
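A sketch of the appendable series path above: a Series written in table format and read back through select(); the store name is illustrative.

import pandas as pd

s = pd.Series(range(5), name='values')
store = pd.HDFStore('series.h5', mode='w')
store.append('s', s)        # routed through AppendableSeriesTable
back = store.select('s')    # the default name 'values' is dropped on read
store.close()
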
obj.columns = cols return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs) -class GenericTable(AppendableFrameTable): +class GenericTable(AppendableFrameTable): """ a table that read/writes the generic pytables table format """ pandas_kind = u('frame_table') table_type = u('generic_table') @@ -3756,7 +3890,7 @@ def indexables(self): for i, n in enumerate(d._v_names): dc = GenericDataIndexableCol( - name=n, pos=i, values=[n], version = self.version) + name=n, pos=i, values=[n], version=self.version) self._indexables.append(dc) return self._indexables @@ -3786,7 +3920,8 @@ def write(self, obj, data_columns=None, **kwargs): for n in self.levels: if n not in data_columns: data_columns.insert(0, n) - return super(AppendableMultiFrameTable, self).write(obj=obj, data_columns=data_columns, **kwargs) + return super(AppendableMultiFrameTable, self).write( + obj=obj, data_columns=data_columns, **kwargs) def read(self, columns=None, **kwargs): if columns is not None: @@ -3798,7 +3933,9 @@ def read(self, columns=None, **kwargs): df = df.set_index(self.levels) # remove names for 'level_%d' - df.index = df.index.set_names([ None if self._re_levels.search(l) else l for l in df.index.names ]) + df.index = df.index.set_names([ + None if self._re_levels.search(l) else l for l in df.index.names + ]) return df @@ -3844,11 +3981,12 @@ def _reindex_axis(obj, axis, labels, other=None): if other is not None: labels = labels & _ensure_index(other.unique()) if not labels.equals(ax): - slicer = [ slice(None, None) ] * obj.ndim + slicer = [slice(None, None)] * obj.ndim slicer[axis] = labels obj = obj.loc[tuple(slicer)] return obj + def _get_info(info, name): """ get/create the info for this name """ try: @@ -3857,19 +3995,21 @@ def _get_info(info, name): idx = info[name] = dict() return idx + def _convert_index(index, encoding=None, format_type=None): index_name = getattr(index, 'name', None) if isinstance(index, DatetimeIndex): converted = index.asi8 return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), + freq=getattr(index, 'freq', None), + tz=getattr(index, 'tz', None), index_name=index_name) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() return IndexCol( index.values, 'integer', atom, freq=getattr(index, 'freq', None), - index_name=index_name) + index_name=index_name) if isinstance(index, MultiIndex): raise TypeError('MultiIndex not supported here!') @@ -3881,7 +4021,8 @@ def _convert_index(index, encoding=None, format_type=None): if inferred_type == 'datetime64': converted = values.view('i8') return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), + freq=getattr(index, 'freq', None), + tz=getattr(index, 'tz', None), index_name=index_name) elif inferred_type == 'datetime': converted = np.array([(time.mktime(v.timetuple()) + @@ -3901,15 +4042,18 @@ def _convert_index(index, encoding=None, format_type=None): converted = _convert_string_array(values, encoding) itemsize = converted.dtype.itemsize return IndexCol( - converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize, - index_name=index_name) + converted, 'string', _tables().StringCol(itemsize), + itemsize=itemsize, index_name=index_name + ) elif inferred_type == 'unicode': if format_type == 'fixed': atom = _tables().ObjectAtom() return IndexCol(np.asarray(values, dtype='O'), 'object', atom, index_name=index_name) raise TypeError( - "[unicode] is not 
supported as a in index type for [{0}] formats".format(format_type)) + "[unicode] is not supported as a in index type for [{0}] formats" + .format(format_type) + ) elif inferred_type == 'integer': # take a guess for now, hope the values fit @@ -4027,6 +4171,7 @@ def _need_convert(kind): return True return False + class Selection(object): """ @@ -4065,9 +4210,14 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): stop = self.table.nrows self.coordinates = np.arange(start, stop)[where] elif issubclass(where.dtype.type, np.integer): - if (self.start is not None and (where < self.start).any()) or (self.stop is not None and (where >= self.stop).any()): + if ((self.start is not None and + (where < self.start).any()) or + (self.stop is not None and + (where >= self.stop).any())): raise ValueError( - "where must have index locations >= start and < stop") + "where must have index locations >= start and " + "< stop" + ) self.coordinates = where except: @@ -4089,21 +4239,27 @@ def generate(self, where): q = self.table.queryables() try: return Expr(where, queryables=q, encoding=self.table.encoding) - except (NameError) as detail: - - # raise a nice message, suggesting that the user should use data_columns - raise ValueError("The passed where expression: {0}\n" - " contains an invalid variable reference\n" - " all of the variable refrences must be a reference to\n" - " an axis (e.g. 'index' or 'columns'), or a data_column\n" - " The currently defined references are: {1}\n".format(where,','.join(q.keys()))) + except NameError as detail: + # raise a nice message, suggesting that the user should use + # data_columns + raise ValueError( + "The passed where expression: {0}\n" + " contains an invalid variable reference\n" + " all of the variable refrences must be a " + "reference to\n" + " an axis (e.g. 'index' or 'columns'), or a " + "data_column\n" + " The currently defined references are: {1}\n" + .format(where, ','.join(q.keys())) + ) def select(self): """ generate the selection """ if self.condition is not None: - return self.table.table.readWhere(self.condition.format(), start=self.start, stop=self.stop) + return self.table.table.readWhere(self.condition.format(), + start=self.start, stop=self.stop) elif self.coordinates is not None: return self.table.table.readCoordinates(self.coordinates) return self.table.table.read(start=self.start, stop=self.stop) @@ -4115,7 +4271,9 @@ def select_coords(self): if self.condition is None: return np.arange(self.table.nrows) - return self.table.table.getWhereList(self.condition.format(), start=self.start, stop=self.stop, sort=True) + return self.table.table.getWhereList(self.condition.format(), + start=self.start, stop=self.stop, + sort=True) # utilities ### diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1d0d1d17ec631..8c172db162cd6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2,9 +2,9 @@ Module contains tools for processing Stata files into DataFrames The StataReader below was originally written by Joe Presbrey as part of PyDTA. -It has been extended and improved by Skipper Seabold from the Statsmodels project -who also developed the StataWriter and was finally added to pandas in an once again -improved version. +It has been extended and improved by Skipper Seabold from the Statsmodels +project who also developed the StataWriter and was finally added to pandas in +an once again improved version. 
You can find more information on http://presbrey.mit.edu/PyDTA and http://statsmodels.sourceforge.net/devel/ @@ -25,7 +25,8 @@ from pandas.io.common import get_filepath_or_buffer -def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None): +def read_stata(filepath_or_buffer, convert_dates=True, + convert_categoricals=True, encoding=None, index=None): """ Read Stata file into DataFrame @@ -63,7 +64,8 @@ def _stata_elapsed_date_to_datetime(date, fmt): Examples -------- - >>> _stata_elapsed_date_to_datetime(52, "%tw") datetime.datetime(1961, 1, 1, 0, 0) + >>> _stata_elapsed_date_to_datetime(52, "%tw") + datetime.datetime(1961, 1, 1, 0, 0) Notes ----- @@ -199,8 +201,11 @@ def __init__(self, offset, value): '.' or ('.' + chr(value - offset + 96)) else: self._str = '.' - string = property(lambda self: self._str, doc="The Stata representation of the missing value: '.', '.a'..'.z'") - value = property(lambda self: self._value, doc='The binary representation of the missing value.') + string = property(lambda self: self._str, + doc="The Stata representation of the missing value: " + "'.', '.a'..'.z'") + value = property(lambda self: self._value, + doc='The binary representation of the missing value.') def __unicode__(self): return self.string @@ -292,19 +297,22 @@ def _decode_bytes(self, str, errors=None): class StataReader(StataParser): """ - Class for working with a Stata dataset. There are two possibilities for usage: + Class for working with a Stata dataset. There are two possibilities for + usage: * The from_dta() method on the DataFrame class. - This will return a DataFrame with the Stata dataset. Note that when using the - from_dta() method, you will not have access to meta-information like variable - labels or the data label. - - * Work with this object directly. Upon instantiation, the header of the Stata data - file is read, giving you access to attributes like variable_labels(), data_label(), - nobs(), ... A DataFrame with the data is returned by the read() method; this will - also fill up the value_labels. Note that calling the value_labels() method will - result in an error if the read() method has not been called yet. This is because - the value labels are stored at the end of a Stata dataset, after the data. + This will return a DataFrame with the Stata dataset. Note that when + using the from_dta() method, you will not have access to + meta-information like variable labels or the data label. + + * Work with this object directly. Upon instantiation, the header of the + Stata data file is read, giving you access to attributes like + variable_labels(), data_label(), nobs(), ... A DataFrame with the data + is returned by the read() method; this will also fill up the + value_labels. Note that calling the value_labels() method will result in + an error if the read() method has not been called yet. This is because + the value labels are stored at the end of a Stata dataset, after the + data. 
 
     Parameters
     ----------
@@ -323,7 +331,9 @@ def __init__(self, path_or_buf, encoding='cp1252'):
         self._data_read = False
         self._value_labels_read = False
         if isinstance(path_or_buf, str):
-            path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding=self._default_encoding)
+            path_or_buf, encoding = get_filepath_or_buffer(
+                path_or_buf, encoding=self._default_encoding
+            )
 
         if isinstance(path_or_buf, (str, compat.text_type, bytes)):
             self.path_or_buf = open(path_or_buf, 'rb')
@@ -334,17 +344,22 @@ def __init__(self, path_or_buf, encoding='cp1252'):
 
     def _read_header(self):
         first_char = self.path_or_buf.read(1)
-        if struct.unpack('c', first_char)[0] == b'<':  # format 117 or higher (XML like)
+        if struct.unpack('c', first_char)[0] == b'<':
+            # format 117 or higher (XML like)
             self.path_or_buf.read(27)  # stata_dta>
             self.format_version = int(self.path_or_buf.read(3))
             if self.format_version not in [117]:
-                raise ValueError("Version of given Stata file is not 104, 105, 108, 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12) or 117 (Stata 13)")
+                raise ValueError("Version of given Stata file is not 104, "
+                                 "105, 108, 113 (Stata 8/9), 114 (Stata "
+                                 "10/11), 115 (Stata 12) or 117 (Stata 13)")
             self.path_or_buf.read(21)  #
             self.byteorder = self.path_or_buf.read(3) == "MSF" and '>' or '<'
             self.path_or_buf.read(15)  #
-            self.nvar = struct.unpack(self.byteorder + 'H', self.path_or_buf.read(2))[0]
+            self.nvar = struct.unpack(self.byteorder + 'H',
+                                      self.path_or_buf.read(2))[0]
             self.path_or_buf.read(7)  #
-            self.nobs = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0]
+            self.nobs = struct.unpack(self.byteorder + 'I',
+                                      self.path_or_buf.read(4))[0]
             self.path_or_buf.read(11)  #
self.path_or_buf.read(8) # 0x0000000000000000 self.path_or_buf.read(8) # position of - seek_vartypes = struct.unpack(self.byteorder + 'q', self.path_or_buf.read(8))[0] + 16 - seek_varnames = struct.unpack(self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 - seek_sortlist = struct.unpack(self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 - seek_formats = struct.unpack(self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9 - seek_value_label_names = struct.unpack(self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19 - seek_variable_labels = struct.unpack(self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17 + seek_vartypes = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 16 + seek_varnames = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 + seek_sortlist = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 + seek_formats = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9 + seek_value_label_names = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19 + seek_variable_labels = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17 self.path_or_buf.read(8) # - self.data_location = struct.unpack(self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6 - self.seek_strls = struct.unpack(self.byteorder + 'q', self.path_or_buf.read(8))[0] + 7 - self.seek_value_labels = struct.unpack(self.byteorder + 'q', self.path_or_buf.read(8))[0] + 14 + self.data_location = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6 + self.seek_strls = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 7 + self.seek_value_labels = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 14 #self.path_or_buf.read(8) # #self.path_or_buf.read(8) # EOF self.path_or_buf.seek(seek_vartypes) - typlist = [struct.unpack(self.byteorder + 'H', self.path_or_buf.read(2))[0] for i in range(self.nvar)] + typlist = [struct.unpack(self.byteorder + 'H', + self.path_or_buf.read(2))[0] + for i in range(self.nvar)] self.typlist = [None]*self.nvar try: i = 0 @@ -378,7 +404,8 @@ def _read_header(self): self.typlist[i] = self.TYPE_MAP_XML[typ] i += 1 except: - raise ValueError("cannot convert stata types [{0}]".format(','.join(typlist))) + raise ValueError("cannot convert stata types [{0}]" + .format(','.join(typlist))) self.dtyplist = [None]*self.nvar try: i = 0 @@ -389,33 +416,45 @@ def _read_header(self): self.dtyplist[i] = self.DTYPE_MAP_XML[typ] i += 1 except: - raise ValueError("cannot convert stata dtypes [{0}]".format(','.join(typlist))) + raise ValueError("cannot convert stata dtypes [{0}]" + .format(','.join(typlist))) self.path_or_buf.seek(seek_varnames) - self.varlist = [self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)] + self.varlist = [self._null_terminate(self.path_or_buf.read(33)) + for i in range(self.nvar)] self.path_or_buf.seek(seek_sortlist) - self.srtlist = struct.unpack(self.byteorder + ('h' * (self.nvar + 1)), self.path_or_buf.read(2 * (self.nvar + 1)))[:-1] + self.srtlist = struct.unpack( + self.byteorder + ('h' * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)) + )[:-1] self.path_or_buf.seek(seek_formats) - self.fmtlist = [self._null_terminate(self.path_or_buf.read(49)) for i in range(self.nvar)] + self.fmtlist = [self._null_terminate(self.path_or_buf.read(49)) + for i in range(self.nvar)] self.path_or_buf.seek(seek_value_label_names) - self.lbllist = 
[self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)] + self.lbllist = [self._null_terminate(self.path_or_buf.read(33)) + for i in range(self.nvar)] self.path_or_buf.seek(seek_variable_labels) - self.vlblist = [self._null_terminate(self.path_or_buf.read(81)) for i in range(self.nvar)] + self.vlblist = [self._null_terminate(self.path_or_buf.read(81)) + for i in range(self.nvar)] else: # header self.format_version = struct.unpack('b', first_char)[0] if self.format_version not in [104, 105, 108, 113, 114, 115]: - raise ValueError("Version of given Stata file is not 104, 105, 108, 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12) or 117 (Stata 13)") + raise ValueError("Version of given Stata file is not 104, " + "105, 108, 113 (Stata 8/9), 114 (Stata " + "10/11), 115 (Stata 12) or 117 (Stata 13)") self.byteorder = self.path_or_buf.read(1) == 0x1 and '>' or '<' self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0] self.path_or_buf.read(1) # unused - self.nvar = struct.unpack(self.byteorder + 'H', self.path_or_buf.read(2))[0] - self.nobs = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] + self.nvar = struct.unpack(self.byteorder + 'H', + self.path_or_buf.read(2))[0] + self.nobs = struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0] if self.format_version > 105: self.data_label = self.path_or_buf.read(81) else: @@ -425,51 +464,73 @@ def _read_header(self): # descriptors if self.format_version > 108: - typlist = [ord(self.path_or_buf.read(1)) for i in range(self.nvar)] + typlist = [ord(self.path_or_buf.read(1)) + for i in range(self.nvar)] else: - typlist = [self.OLD_TYPE_MAPPING[self._decode_bytes(self.path_or_buf.read(1))] for i in range(self.nvar)] + typlist = [ + self.OLD_TYPE_MAPPING[ + self._decode_bytes(self.path_or_buf.read(1)) + ] for i in range(self.nvar) + ] try: self.typlist = [self.TYPE_MAP[typ] for typ in typlist] except: - raise ValueError("cannot convert stata types [{0}]".format(','.join(typlist))) + raise ValueError("cannot convert stata types [{0}]" + .format(','.join(typlist))) try: self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] except: - raise ValueError("cannot convert stata dtypes [{0}]".format(','.join(typlist))) + raise ValueError("cannot convert stata dtypes [{0}]" + .format(','.join(typlist))) if self.format_version > 108: - self.varlist = [self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)] + self.varlist = [self._null_terminate(self.path_or_buf.read(33)) + for i in range(self.nvar)] else: - self.varlist = [self._null_terminate(self.path_or_buf.read(9)) for i in range(self.nvar)] - self.srtlist = struct.unpack(self.byteorder + ('h' * (self.nvar + 1)), self.path_or_buf.read(2 * (self.nvar + 1)))[:-1] + self.varlist = [self._null_terminate(self.path_or_buf.read(9)) + for i in range(self.nvar)] + self.srtlist = struct.unpack( + self.byteorder + ('h' * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)) + )[:-1] if self.format_version > 113: - self.fmtlist = [self._null_terminate(self.path_or_buf.read(49)) for i in range(self.nvar)] + self.fmtlist = [self._null_terminate(self.path_or_buf.read(49)) + for i in range(self.nvar)] elif self.format_version > 104: - self.fmtlist = [self._null_terminate(self.path_or_buf.read(12)) for i in range(self.nvar)] + self.fmtlist = [self._null_terminate(self.path_or_buf.read(12)) + for i in range(self.nvar)] else: - self.fmtlist = [self._null_terminate(self.path_or_buf.read(7)) for i in range(self.nvar)] + self.fmtlist = 
[self._null_terminate(self.path_or_buf.read(7)) + for i in range(self.nvar)] if self.format_version > 108: - self.lbllist = [self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)] + self.lbllist = [self._null_terminate(self.path_or_buf.read(33)) + for i in range(self.nvar)] else: - self.lbllist = [self._null_terminate(self.path_or_buf.read(9)) for i in range(self.nvar)] + self.lbllist = [self._null_terminate(self.path_or_buf.read(9)) + for i in range(self.nvar)] if self.format_version > 105: - self.vlblist = [self._null_terminate(self.path_or_buf.read(81)) for i in range(self.nvar)] + self.vlblist = [self._null_terminate(self.path_or_buf.read(81)) + for i in range(self.nvar)] else: - self.vlblist = [self._null_terminate(self.path_or_buf.read(32)) for i in range(self.nvar)] + self.vlblist = [self._null_terminate(self.path_or_buf.read(32)) + for i in range(self.nvar)] # ignore expansion fields (Format 105 and later) - # When reading, read five bytes; the last four bytes now tell you the - # size of the next read, which you discard. You then continue like - # this until you read 5 bytes of zeros. + # When reading, read five bytes; the last four bytes now tell you + # the size of the next read, which you discard. You then continue + # like this until you read 5 bytes of zeros. if self.format_version > 104: while True: - data_type = struct.unpack(self.byteorder + 'b', self.path_or_buf.read(1))[0] + data_type = struct.unpack(self.byteorder + 'b', + self.path_or_buf.read(1))[0] if self.format_version > 108: - data_len = struct.unpack(self.byteorder + 'i', self.path_or_buf.read(4))[0] + data_len = struct.unpack(self.byteorder + 'i', + self.path_or_buf.read(4))[0] else: - data_len = struct.unpack(self.byteorder + 'h', self.path_or_buf.read(2))[0] + data_len = struct.unpack(self.byteorder + 'h', + self.path_or_buf.read(2))[0] if data_type == 0: break self.path_or_buf.read(data_len) @@ -477,13 +538,15 @@ def _read_header(self): # necessary data to continue parsing self.data_location = self.path_or_buf.tell() - self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0 + self.has_string_data = len([x for x in self.typlist + if type(x) is int]) > 0 """Calculate size of a data record.""" self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist) def _calcsize(self, fmt): - return type(fmt) is int and fmt or struct.calcsize(self.byteorder + fmt) + return (type(fmt) is int and fmt + or struct.calcsize(self.byteorder + fmt)) def _col_size(self, k=None): if k is None: @@ -503,7 +566,8 @@ def _unpack(self, fmt, byt): return d def _null_terminate(self, s): - if compat.PY3 or self._encoding is not None: # have bytes not strings, so must decode + if compat.PY3 or self._encoding is not None: # have bytes not strings, + # so must decode null_byte = b"\0" try: s = s[:s.index(null_byte)] @@ -523,14 +587,24 @@ def _next(self): data = [None] * self.nvar for i in range(len(data)): if type(typlist[i]) is int: - data[i] = self._null_terminate(self.path_or_buf.read(typlist[i])) + data[i] = self._null_terminate( + self.path_or_buf.read(typlist[i]) + ) else: - data[i] = self._unpack(typlist[i], self.path_or_buf.read(self._col_size(i))) + data[i] = self._unpack( + typlist[i], self.path_or_buf.read(self._col_size(i)) + ) return data else: - return list(map(lambda i: self._unpack(typlist[i], - self.path_or_buf.read(self._col_size(i))), - range(self.nvar))) + return list( + map( + lambda i: self._unpack(typlist[i], + self.path_or_buf.read( + self._col_size(i) + )), + range(self.nvar) 
+ ) + ) def _dataset(self): """ @@ -562,14 +636,17 @@ def _read_value_labels(self): self.path_or_buf.seek(self.seek_value_labels) else: if not self._data_read: - raise Exception("Data has not been read. Because of the layout of Stata files, this is necessary before reading value labels.") + raise Exception("Data has not been read. Because of the " + "layout of Stata files, this is necessary " + "before reading value labels.") if self._value_labels_read: raise Exception("Value labels have already been read.") self.value_label_dict = dict() if self.format_version <= 108: - return # Value labels are not supported in version 108 and earlier. + # Value labels are not supported in version 108 and earlier. + return while True: if self.format_version >= 117: @@ -582,18 +659,24 @@ def _read_value_labels(self): labname = self._null_terminate(self.path_or_buf.read(33)) self.path_or_buf.read(3) # padding - n = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] - txtlen = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] + n = struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0] + txtlen = struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0] off = [] for i in range(n): - off.append(struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0]) + off.append(struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0]) val = [] for i in range(n): - val.append(struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0]) + val.append(struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0]) txt = self.path_or_buf.read(txtlen) self.value_label_dict[labname] = dict() for i in range(n): - self.value_label_dict[labname][val[i]] = self._null_terminate(txt[off[i]:]) + self.value_label_dict[labname][val[i]] = ( + self._null_terminate(txt[off[i]:]) + ) if self.format_version >= 117: self.path_or_buf.read(6) # @@ -606,9 +689,11 @@ def _read_strls(self): if self.path_or_buf.read(3) != b'GSO': break - v_o = struct.unpack(self.byteorder + 'L', self.path_or_buf.read(8))[0] + v_o = struct.unpack(self.byteorder + 'L', + self.path_or_buf.read(8))[0] typ = self.path_or_buf.read(1) - length = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] + length = struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0] self.GSO[v_o] = self.path_or_buf.read(length-1) self.path_or_buf.read(1) # zero-termination @@ -621,7 +706,8 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None): convert_dates : boolean, defaults to True Convert date variables to DataFrame time values convert_categoricals : boolean, defaults to True - Read value labels and convert columns to Categorical/Factor variables + Read value labels and convert columns to Categorical/Factor + variables index : identifier of index column identifier of column that should be used as index of the DataFrame @@ -659,21 +745,28 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None): if self.dtyplist[i] is not None: col = data.columns[i] if data[col].dtype is not np.dtype(object): - data[col] = Series(data[col], data[col].index, self.dtyplist[i]) + data[col] = Series(data[col], data[col].index, + self.dtyplist[i]) if convert_dates: - cols = np.where(lmap(lambda x: x in _date_formats, self.fmtlist))[0] + cols = np.where(lmap(lambda x: x in _date_formats, + self.fmtlist))[0] for i in cols: col = data.columns[i] - data[col] = data[col].apply(_stata_elapsed_date_to_datetime, args=(self.fmtlist[i],)) + data[col] = 
data[col].apply(_stata_elapsed_date_to_datetime, + args=(self.fmtlist[i],)) if convert_categoricals: - cols = np.where(lmap(lambda x: x in compat.iterkeys(self.value_label_dict), self.lbllist))[0] + cols = np.where( + lmap(lambda x: x in compat.iterkeys(self.value_label_dict), + self.lbllist) + )[0] for i in cols: col = data.columns[i] labeled_data = np.copy(data[col]) labeled_data = labeled_data.astype(object) - for k, v in compat.iteritems(self.value_label_dict[self.lbllist[i]]): + for k, v in compat.iteritems( + self.value_label_dict[self.lbllist[i]]): labeled_data[(data[col] == k).values] = v data[col] = Categorical.from_array(labeled_data) @@ -684,11 +777,15 @@ def data_label(self): return self.data_label def variable_labels(self): - """Returns variable labels as a dict, associating each variable name with corresponding label""" + """Returns variable labels as a dict, associating each variable name + with corresponding label + """ return dict(zip(self.varlist, self.vlblist)) def value_labels(self): - """Returns a dict, associating each variable name a dict, associating each value its corresponding label""" + """Returns a dict, associating each variable name a dict, associating + each value its corresponding label + """ if not self._value_labels_read: self._read_value_labels() @@ -745,7 +842,9 @@ def _maybe_convert_to_int_keys(convert_dates, varlist): new_dict.update({varlist.index(key): convert_dates[key]}) else: if not isinstance(key, int): - raise ValueError("convery_dates key is not in varlist and is not an int") + raise ValueError( + "convery_dates key is not in varlist and is not an int" + ) new_dict.update({key: convert_dates[key]}) return new_dict @@ -769,7 +868,8 @@ def _dtype_to_stata_type(dtype): if dtype.type == np.string_: return chr(dtype.itemsize) elif dtype.type == np.object_: # try to coerce it to the biggest string - # not memory efficient, what else could we do? + # not memory efficient, what else could we + # do? return chr(244) elif dtype == np.float64: return chr(255) @@ -856,8 +956,8 @@ class StataWriter(StataParser): >>> writer = StataWriter('./date_data_file.dta', date, {2 : 'tw'}) >>> writer.write_file() """ - def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", - byteorder=None): + def __init__(self, fname, data, convert_dates=None, write_index=True, + encoding="latin-1", byteorder=None): super(StataWriter, self).__init__(encoding) self._convert_dates = convert_dates self._write_index = write_index @@ -867,7 +967,9 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, encoding=" if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) - self._file = _open_file_binary_write(fname, self._encoding or self._default_encoding) + self._file = _open_file_binary_write( + fname, self._encoding or self._default_encoding + ) self.type_converters = {253: np.long, 252: int} def _write(self, to_write): @@ -875,7 +977,8 @@ def _write(self, to_write): Helper to call encode before writing to file for Python 3 compat. 
""" if compat.PY3: - self._file.write(to_write.encode(self._encoding or self._default_encoding)) + self._file.write(to_write.encode(self._encoding or + self._default_encoding)) else: self._file.write(to_write) @@ -898,9 +1001,13 @@ def __iter__(self): self.varlist = data.columns.tolist() dtypes = data.dtypes if self._convert_dates is not None: - self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates, self.varlist) + self._convert_dates = _maybe_convert_to_int_keys( + self._convert_dates, self.varlist + ) for key in self._convert_dates: - new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) + new_type = _convert_datetime_to_stata_type( + self._convert_dates[key] + ) dtypes[key] = np.dtype(new_type) self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes] self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes] @@ -940,14 +1047,18 @@ def _write_header(self, data_label=None, time_stamp=None): if data_label is None: self._file.write(self._null_terminate(_pad_bytes("", 80))) else: - self._file.write(self._null_terminate(_pad_bytes(data_label[:80], 80))) + self._file.write( + self._null_terminate(_pad_bytes(data_label[:80], 80)) + ) # time stamp, 18 bytes, char, null terminated # format dd Mon yyyy hh:mm if time_stamp is None: time_stamp = datetime.datetime.now() elif not isinstance(time_stamp, datetime): raise ValueError("time_stamp should be datetime type") - self._file.write(self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M"))) + self._file.write( + self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M")) + ) def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, fmtlist=None, lbllist=None): @@ -996,7 +1107,8 @@ def _write_data_nodates(self): self._write(var) else: try: - self._file.write(struct.pack(byteorder + TYPE_MAP[typ], var)) + self._file.write(struct.pack(byteorder + TYPE_MAP[typ], + var)) except struct.error: # have to be strict about type pack won't do any # kind of casting diff --git a/pandas/io/wb.py b/pandas/io/wb.py index a585cb9adccbb..362b7b192f746 100644 --- a/pandas/io/wb.py +++ b/pandas/io/wb.py @@ -32,21 +32,22 @@ def download(country=['MX', 'CA', 'US'], indicator=['GDPPCKD', 'GDPPCKN'], """ # Are ISO-2 country codes valid? 
- valid_countries = ["AG", "AL", "AM", "AO", "AR", "AT", "AU", "AZ", "BB", - "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BO", "BR", "BS", "BW", - "BY", "BZ", "CA", "CD", "CF", "CG", "CH", "CI", "CL", "CM", "CN", - "CO", "CR", "CV", "CY", "CZ", "DE", "DK", "DM", "DO", "DZ", "EC", - "EE", "EG", "ER", "ES", "ET", "FI", "FJ", "FR", "GA", "GB", "GE", - "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HK", "HN", "HR", - "HT", "HU", "ID", "IE", "IL", "IN", "IR", "IS", "IT", "JM", "JO", - "JP", "KE", "KG", "KH", "KM", "KR", "KW", "KZ", "LA", "LB", "LC", - "LK", "LS", "LT", "LU", "LV", "MA", "MD", "MG", "MK", "ML", "MN", - "MR", "MU", "MW", "MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", - "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PT", - "PY", "RO", "RU", "RW", "SA", "SB", "SC", "SD", "SE", "SG", "SI", - "SK", "SL", "SN", "SR", "SV", "SY", "SZ", "TD", "TG", "TH", "TN", - "TR", "TT", "TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", - "VN", "VU", "YE", "ZA", "ZM", "ZW", "all"] + valid_countries = [ + "AG", "AL", "AM", "AO", "AR", "AT", "AU", "AZ", "BB", "BD", "BE", "BF", + "BG", "BH", "BI", "BJ", "BO", "BR", "BS", "BW", "BY", "BZ", "CA", "CD", + "CF", "CG", "CH", "CI", "CL", "CM", "CN", "CO", "CR", "CV", "CY", "CZ", + "DE", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ER", "ES", "ET", "FI", + "FJ", "FR", "GA", "GB", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", + "GY", "HK", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IR", "IS", + "IT", "JM", "JO", "JP", "KE", "KG", "KH", "KM", "KR", "KW", "KZ", "LA", + "LB", "LC", "LK", "LS", "LT", "LU", "LV", "MA", "MD", "MG", "MK", "ML", + "MN", "MR", "MU", "MW", "MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", + "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PT", "PY", + "RO", "RU", "RW", "SA", "SB", "SC", "SD", "SE", "SG", "SI", "SK", "SL", + "SN", "SR", "SV", "SY", "SZ", "TD", "TG", "TH", "TN", "TR", "TT", "TW", + "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "VU", "YE", "ZA", + "ZM", "ZW", "all" + ] if type(country) == str: country = [country] bad_countries = np.setdiff1d(country, valid_countries) @@ -68,7 +69,8 @@ def download(country=['MX', 'CA', 'US'], indicator=['GDPPCKD', 'GDPPCKN'], # Warn if len(bad_indicators) > 0: print('Failed to obtain indicator(s): %s' % '; '.join(bad_indicators)) - print('The data may still be available for download at http://data.worldbank.org') + print('The data may still be available for download at ' + 'http://data.worldbank.org') if len(bad_countries) > 0: print('Invalid ISO-2 codes: %s' % ' '.join(bad_countries)) # Merge WDI series @@ -84,9 +86,9 @@ def download(country=['MX', 'CA', 'US'], indicator=['GDPPCKD', 'GDPPCKN'], def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US', start=2002, end=2005): # Build URL for api call - url = "http://api.worldbank.org/countries/" + country + "/indicators/" + \ - indicator + "?date=" + str(start) + ":" + str(end) + "&per_page=25000" + \ - "&format=json" + url = ("http://api.worldbank.org/countries/" + country + "/indicators/" + + indicator + "?date=" + str(start) + ":" + str(end) + + "&per_page=25000&format=json") # Download with urlopen(url) as response: data = response.read()
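
For readers scanning this last hunk, here is a minimal sketch of what the rewrapped URL expression in _get_data evaluates to when called with the default arguments shown in its signature; the values below are just those defaults, used for illustration.

# Minimal sketch: evaluate the URL expression from the hunk above with the
# default arguments of _get_data (illustrative values only).
indicator = "NY.GNS.ICTR.GN.ZS"
country = "US"
start, end = 2002, 2005

url = ("http://api.worldbank.org/countries/" + country + "/indicators/" +
       indicator + "?date=" + str(start) + ":" + str(end) +
       "&per_page=25000&format=json")

print(url)
# http://api.worldbank.org/countries/US/indicators/NY.GNS.ICTR.GN.ZS?date=2002:2005&per_page=25000&format=json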