diff --git a/RELEASE.rst b/RELEASE.rst index bcd41813fb91f..515d9bab794ec 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -38,6 +38,22 @@ pandas 0.8.0 - Add support for indexes (dates or otherwise) with duplicates and common sense indexing/selection functionality - Series/DataFrame.update methods, in-place variant of combine_first (#961) + - Add ``match`` function to API (#502) + - Add Cython-optimized first, last, min, max, prod functions to GroupBy (#994, + #1043) + - Dates can be split across multiple columns (#1227, #1186) + - Add experimental support for converting pandas DataFrame to R data.frame + via rpy2 (#350, #1212) + - Can pass list of (name, function) to GroupBy.aggregate to get aggregates in + a particular order (#610) + - Can pass dicts with lists of functions or dicts to GroupBy aggregate to do + much more flexible multiple function aggregation (#642) + - New ordered_merge function for merging DataFrames with ordered + data. Also supports group-wise merging for panel data (#813) + - Add keys() method to DataFrame + - Add flexible replace method for replacing potentially arbitrary values in + Series and DataFrame (#929, #1241) + - Add 'kde' plot kind for Series/DataFrame.plot (#1059) **Improvements to existing features** @@ -50,6 +66,12 @@ pandas 0.8.0 - Can pass arrays in addition to column names to DataFrame.set_index (#402) - Improve the speed of "square" reindexing of homogeneous DataFrame objects by significant margin (#836) + - Handle more dtypes when passed MaskedArrays in DataFrame constructor (#406) + - Improved performance of join operations on integer keys (#682) + - Can pass multiple columns to GroupBy object, e.g. grouped[[col1, col2]] to + only aggregate a subset of the value columns (#383) + - Add histogram / kde plot options for scatter_matrix diagonals (#1237) + - Add inplace option to DataFrame.drop_duplicates (#805) **API Changes** @@ -57,6 +79,8 @@ pandas 0.8.0 (#1073) - Change BDay (business day) to not normalize dates by default - Remove deprecated DataMatrix name + - Default merge suffixes for overlap now have underscores instead of periods + to facilitate tab completion, etc. (#1239) **Bug fixes** @@ -76,6 +100,10 @@ pandas 0.8.0 cases. Fix pivot table bug (#1181) - Fix formatting of MultiIndex on Series/DataFrame when index name coincides with label (#1217) + - Handle Excel 2003 #N/A as NaN from xlrd (#1213, #1225) + - Fix timestamp locale-related deserialization issues with HDFStore by moving + to datetime64 representation (#1081, #809) + - Fix DataFrame.duplicated/drop_duplicates NA value handling (#557) pandas 0.7.3 ============ diff --git a/doc/make.py b/doc/make.py index 8597b2efb7f7c..98767ae67ce43 100755 --- a/doc/make.py +++ b/doc/make.py @@ -25,35 +25,29 @@ SPHINX_BUILD = 'sphinxbuild' -def sf(): - 'push a copy to the sf' - os.system('cd build/html; rsync -avz . wesmckinn,pandas@web.sf.net' - ':/home/groups/p/pa/pandas/htdocs/ -essh --cvs-exclude') - def upload_dev(): 'push a copy to the pydata dev directory' - os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh') + if os.system('cd build/html; rsync -avz . 
pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'): + raise SystemExit('Upload to Pydata Dev failed') def upload_dev_pdf(): 'push a copy to the pydata dev directory' - os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/') + if os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/dev/'): + raise SystemExit('PDF upload to Pydata Dev failed') def upload_stable(): - 'push a copy to the pydata dev directory' - os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh') + 'push a copy to the pydata stable directory' + if os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'): + raise SystemExit('Upload to stable failed') def upload_stable_pdf(): 'push a copy to the pydata dev directory' - os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/') - -def sfpdf(): - 'push a copy to the sf site' - os.system('cd build/latex; scp pandas.pdf wesmckinn,pandas@web.sf.net' - ':/home/groups/p/pa/pandas/htdocs/') + if os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/stable/'): + raise SystemExit('PDF upload to stable failed') def clean(): if os.path.exists('build'): @@ -102,6 +96,80 @@ def all(): # clean() html() +def auto_dev_build(debug=False): + msg = '' + try: + clean() + html() + latex() + upload_dev() + upload_dev_pdf() + if not debug: + sendmail() + except (Exception, SystemExit), inst: + msg += str(inst) + '\n' + sendmail(msg) + +def sendmail(err_msg=None): + from_name, to_name = _get_config() + + if err_msg is None: + msgstr = 'Daily docs build completed successfully' + subject = "DOC: daily build successful" + else: + msgstr = err_msg + subject = "DOC: daily build failed" + + import smtplib + from email.MIMEText import MIMEText + msg = MIMEText(msgstr) + msg['Subject'] = subject + msg['From'] = from_name + msg['To'] = to_name + + server_str, port, login, pwd = _get_credentials() + server = smtplib.SMTP(server_str, port) + server.ehlo() + server.starttls() + server.ehlo() + + server.login(login, pwd) + try: + server.sendmail(from_name, to_name, msg.as_string()) + finally: + server.close() + +def _get_dir(): + import getpass + USERNAME = getpass.getuser() + if sys.platform == 'darwin': + HOME = '/Users/%s' % USERNAME + else: + HOME = '/home/%s' % USERNAME + + tmp_dir = '%s/tmp' % HOME + return tmp_dir + +def _get_credentials(): + tmp_dir = _get_dir() + cred = '%s/credentials' % tmp_dir + with open(cred, 'r') as fh: + server, port, un, domain = fh.read().split(',') + port = int(port) + login = un + '@' + domain + '.com' + + import base64 + with open('%s/cron_email_pwd' % tmp_dir, 'r') as fh: + pwd = base64.b64decode(fh.read()) + + return server, port, login, pwd + +def _get_config(): + tmp_dir = _get_dir() + with open('%s/config' % tmp_dir, 'r') as fh: + from_name, to_name = fh.read().split(',') + return from_name, to_name + funcd = { 'html' : html, 'upload_dev' : upload_dev, @@ -110,8 +178,8 @@ def all(): 'upload_stable_pdf' : upload_stable_pdf, 'latex' : latex, 'clean' : clean, - 'sf' : sf, - 'sfpdf' : sfpdf, + 'auto_dev' : auto_dev_build, + 'auto_debug' : lambda: auto_dev_build(True), 'all' : all, } diff --git a/doc/source/basics.rst b/doc/source/basics.rst index c038f5e953cb2..014bf7ea58f8a 100644 --- 
a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -491,7 +491,7 @@ With a DataFrame, you can simultaneously reindex the index and columns: df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one']) For convenience, you may utilize the ``reindex_axis`` method, which takes the -labels and a keyword ``axis`` paramater. +labels and a keyword ``axis`` parameter. Note that the ``Index`` objects containing the actual axis labels can be **shared** between objects. So if we have a Series and a DataFrame, the @@ -657,7 +657,7 @@ set of labels from an axis: df.drop(['a', 'd'], axis=0) df.drop(['one'], axis=1) -Note that the following also works, but a bit less obvious / clean: +Note that the following also works, but is a bit less obvious / clean: .. ipython:: python @@ -685,24 +685,25 @@ Series, it need only contain a subset of the labels as keys: df.rename(columns={'one' : 'foo', 'two' : 'bar'}, index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'}) -The ``rename`` method also provides a ``copy`` named parameter that is by -default ``True`` and copies the underlying data. Pass ``copy=False`` to rename -the data in place. +The ``rename`` method also provides an ``inplace`` named parameter that is by +default ``False`` and copies the underlying data. Pass ``inplace=True`` to +rename the data in place. .. _basics.rename_axis: -The Panel class has an a related ``rename_axis`` class which can rename any of +The Panel class has a related ``rename_axis`` class which can rename any of its three axes. Iteration --------- -Considering the pandas as somewhat dict-like structure, basic iteration -produces the "keys" of the objects, namely: +Because Series is array-like, basic iteration produces the values. Other data +structures follow the dict-like convention of iterating over the "keys" of the +objects. In short: - * **Series**: the index label - * **DataFrame**: the column labels - * **Panel**: the item labels + * **Series**: values + * **DataFrame**: column labels + * **Panel**: item labels Thus, for example: diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 25be861295395..f058eab89d067 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -171,10 +171,10 @@ accept the following arguments: - ``window``: size of moving window - ``min_periods``: threshold of non-null data points to require (otherwise result is NA) - - ``freq``: optionally specify a :ref: `frequency string ` or :ref:`DateOffset ` - to pre-conform the data to. Note that prior to pandas v0.8.0, a keyword - argument ``time_rule`` was used instead of ``freq`` that referred to - the legacy time rule constants + - ``freq``: optionally specify a :ref: `frequency string ` + or :ref:`DateOffset ` to pre-conform the data to. 
+ Note that prior to pandas v0.8.0, a keyword argument ``time_rule`` was used + instead of ``freq`` that referred to the legacy time rule constants These functions can be applied to ndarrays or Series objects: diff --git a/doc/source/conf.py b/doc/source/conf.py index 970700ff4d275..f2fc6511143d8 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -209,7 +209,7 @@ latex_documents = [ ('index', 'pandas.tex', u'pandas: powerful Python data analysis toolkit', - u'Wes McKinney', 'manual'), + u'Wes McKinney\n& PyData Development Team', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 4f8c7166e5024..c2ef0d74ced53 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -200,7 +200,7 @@ of the DataFrame): Consider the ``isin`` method of Series, which returns a boolean vector that is true wherever the Series elements exist in the passed list. This allows you to -select out rows where one or more columns have values you want: +select rows where one or more columns have values you want: .. ipython:: python @@ -215,7 +215,7 @@ more complex criteria: .. ipython:: python # only want 'two' or 'three' - criterion = df2['a'].map(lambda x: x.startswith('t') + criterion = df2['a'].map(lambda x: x.startswith('t')) df2[criterion] @@ -319,7 +319,7 @@ Duplicate Data .. _indexing.duplicate: -If you want to indentify and remove duplicate rows in a DataFrame, there are +If you want to identify and remove duplicate rows in a DataFrame, there are two methods that will help: ``duplicated`` and ``drop_duplicates``. Each takes as an argument the columns to use to identify duplicated rows. @@ -567,9 +567,9 @@ Hierarchical indexing (MultiIndex) Hierarchical indexing (also referred to as "multi-level" indexing) is brand new in the pandas 0.4 release. It is very exciting as it opens the door to some quite sophisticated data analysis and manipulation, especially for working with -higher dimensional data. In essence, it enables you to effectively store and -manipulate arbitrarily high dimension data in a 2-dimensional tabular structure -(DataFrame), for example. It is not limited to DataFrame +higher dimensional data. In essence, it enables you to store and manipulate +data with an arbitrary number of dimensions in lower dimensional data +structures like Series (1d) and DataFrame (2d). In this section, we will show what exactly we mean by "hierarchical" indexing and how it integrates with the all of the pandas indexing functionality @@ -611,6 +611,7 @@ As a convenience, you can pass a list of arrays directly into Series or DataFrame to construct a MultiIndex automatically: .. ipython:: python + arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] s = Series(randn(8), index=arrays) diff --git a/doc/source/io.rst b/doc/source/io.rst index 98a69ba504e87..caa2a8a80d6ae 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -59,7 +59,7 @@ The two workhorse functions for reading text files (a.k.a. flat files) are They both use the same parsing code to intelligently convert tabular data into a DataFrame object. They can take a number of arguments: - - ``path_or_buffer``: Either a string path to a file, or any object with a + - ``filepath_or_buffer``: Either a string path to a file, or any object with a ``read`` method (such as an open file or ``StringIO``). 
- ``sep`` or ``delimiter``: A delimiter / separator to split fields on. `read_csv` is capable of inferring the delimiter automatically in some diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index d724938c29451..293832e23c414 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -204,8 +204,7 @@ for interpolation methods outside of the filling methods described above. :suppress: np.random.seed(123456) - ts = Series(randn(100), index=date_range('1/1/2000', periods=100, - timeRule='EOM')) + ts = Series(randn(100), index=date_range('1/1/2000', periods=100, freq='BM')) ts[20:40] = np.nan ts[60:80] = np.nan ts = ts.cumsum() diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index e409a1a64961a..14629412c783a 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -4,28 +4,29 @@ .. ipython:: python :suppress: + from datetime import datetime import numpy as np np.random.seed(123456) from pandas import * randn = np.random.randn np.set_printoptions(precision=4, suppress=True) from dateutil import relativedelta - from pandas.core.datetools import * + from pandas.tseries.api import * ******************************** Time Series / Date functionality ******************************** pandas has proven very successful as a tool for working with time series data, -especially in the financial data analysis space. Over the coming year we will -be looking to consolidate the various Python libraries for time series data, -e.g. ``scikits.timeseries``, using the new NumPy ``datetime64`` dtype, to -create a very nice integrated solution. Everything in pandas at the moment is -based on using Python ``datetime`` objects. +especially in the financial data analysis space. With the 0.8 release, we have +further improved the time series API in pandas by leaps and bounds. Using the +new NumPy ``datetime64`` dtype, we have consolidated a large number of features +from other Python libraries like ``scikits.timeseries`` as well as created +a tremendous amount of new functionality for manipulating time series data. In working with time series data, we will frequently seek to: - - generate sequences of fixed-frequency dates + - generate sequences of fixed-frequency dates and time spans - conform or convert time series to a particular frequency - compute "relative" dates based on various non-standard time increments (e.g. 5 business days before the last business day of the year), or "roll" @@ -34,18 +35,85 @@ In working with time series data, we will frequently seek to: pandas provides a relatively compact and self-contained set of tools for performing the above tasks. -.. note:: +.. _timeseries.representation: + +Time Stamps vs. Time Spans +-------------------------- + +While most time series representations of data associate values with a time +stamp, in many cases it is more natural to associate the values with a given +time span. For example, it is easy to think of level variables at a +particular point in time, but much more intuitive to think of change variables +over spans of time. Starting with 0.8, pandas allows you to capture both +representations and convert between them. Under the hood, pandas represents +timestamps using instances of ``Timestamp`` and sequences of timestamps using +instances of ``DatetimeIndex``. For regular time spans, pandas uses ``Period`` +objects for scalar values and ``PeriodIndex`` for sequences of spans. 
+Better support for irregular intervals with arbitrary start and end points is +forthcoming in future releases. + +For example: - This area of pandas has gotten less development attention recently, though - this should change in the near future. +.. ipython:: python + + # Time stamped data + dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)] + ts = Series(np.random.randn(3), dates) + + type(ts.index) + + ts + + # Time span data + periods = PeriodIndex([Period('2012-01'), Period('2012-02'), + Period('2012-03')]) + ts = Series(np.random.randn(3), periods) + + type(ts.index) + + ts + +.. _timeseries.timestamprange: + +Generating Ranges of Timestamps +------------------------------- + +To generate an index with time stamps, you can use either the DatetimeIndex or +Index constructor and pass in a list of datetime objects: + +.. ipython:: python + + dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)] + index = DatetimeIndex(dates) + index # Note the frequency information + + index = Index(dates) + index # Automatically converted to DatetimeIndex + +Practically, this becomes very cumbersome because we often need a very long +index with a large number of timestamps. If we need timestamps on a regular +frequency, we can use the pandas functions ``date_range`` and ``bdate_range`` +to create timestamp indexes. + +.. ipython:: python + + index = date_range('2000-1-1', periods=1000, freq='M') + index + + index = bdate_range('2012-1-1', periods=250) + index .. _timeseries.offsets: DateOffset objects ------------------ -A ``DateOffset`` instance represents a frequency increment. Different offset -logic via subclasses: +In order to create the sequence of dates with a monthly frequency in the +previous example, we used the ``freq`` keyword and gave it 'M' as the input. +Under the hood, the string 'M' is interpreted as an instance of pandas +``DateOffset``. ``DateOffset`` represents a regular frequency increment. +Specific offset logic like "business day" or "one hour" is represented in its +various subclasses. .. csv-table:: :header: "Class name", "Description" @@ -54,16 +122,24 @@ logic via subclasses: DateOffset, "Generic offset class, defaults to 1 calendar day" BDay, "business day (weekday)" Week, "one week, optionally anchored on a day of the week" + WeekOfMonth, "the x-th day of the y-th week of each month" MonthEnd, "calendar month end" + MonthBegin, "calendar month begin" BMonthEnd, "business month end" + BMonthBegin, "business month begin" QuarterEnd, "calendar quarter end" + QuarterBegin, "calendar quarter begin" BQuarterEnd, "business quarter end" + BQuarterBegin, "business quarter begin" YearEnd, "calendar year end" YearBegin, "calendar year begin" BYearEnd, "business year end" + BYearBegin, "business year begin" Hour, "one hour" Minute, "one minute" Second, "one second" + Milli, "one millisecond" + Micro, "one microsecond" The basic ``DateOffset`` takes the same arguments as ``dateutil.relativedelta``, which works like: @@ -113,7 +189,7 @@ The ``rollforward`` and ``rollback`` methods do exactly what you would expect: offset.rollforward(d) offset.rollback(d) -It's definitely worth exploring the ``pandas.core.datetools`` module and the +It's definitely worth exploring the ``pandas.tseries.offsets`` module and the various docstrings for the classes. Parametric offsets ~~~~~~~~~~~~~~~~~~ @@ -130,7 +206,14 @@ particular day of the week: d + Week(weekday=4) (d + Week(weekday=4)).weekday() -.. 
_timeseries.freq: +Another example is parameterizing ``YearEnd`` with the specific ending month: + +.. ipython:: python + + d + YearEnd() + d + YearEnd(month=6) + +.. _timeseries.alias: Offset Aliases ~~~~~~~~~~~~~~ @@ -202,9 +285,9 @@ For some frequencies you can specify an anchoring suffix: "(B)A(S)\-OCT", "annual frequency, anchored end of October" "(B)A(S)\-NOV", "annual frequency, anchored end of November" -These can be used as arguments to ``date_range``, ``period_range``, constructors -for ``PeriodIndex`` and ``DatetimeIndex``, as well as various other time -series-related functions in pandas. +These can be used as arguments to ``date_range``, ``bdate_range``, constructors +for ``DatetimeIndex``, as well as various other timeseries-related functions +in pandas. Note that prior to v0.8.0, time rules had a slightly different look. Pandas will continue to support the legacy time rules for the time being but it is @@ -242,67 +325,74 @@ strongly recommended that you switch to using the new offset aliases. "ms", "L" "us": "U" -Note that the legacy quarterly and annual frequencies are business quarter and -business year ends. Also note the legacy time rule for milliseconds ``ms`` -versus the new offset alias for month start ``MS``. This means that offset -alias parsing is case sensitive. +As you can see, legacy quarterly and annual frequencies are business quarter +and business year ends. Please also note the legacy time rule for milliseconds +``ms`` versus the new offset alias for month start ``MS``. This means that +offset alias parsing is case sensitive. .. _timeseries.daterange: -Generating date ranges (date_range) ---------------------------------- -The ``date_range`` class utilizes these offsets (and any ones that we might add) -to generate fixed-frequency date ranges: +More on date ranges +------------------- +Convenience functions like ``date_range`` and ``bdate_range`` utilize the +offsets described above to generate fixed-frequency date ranges. The default +frequency for ``date_range`` is a **calendar day** while the default for +``bdate_range`` is a **business day**. .. ipython:: python start = datetime(2009, 1, 1) end = datetime(2010, 1, 1) - rng = date_range(start, end, offset=BDay()) rng - date_range(start, end, offset=BMonthEnd()) -**Business day frequency** is the default for ``date_range``. You can also -strictly generate a ``date_range`` of a certain length by providing either a -start or end date and a ``periods`` argument: + rng = bdate_range(start, end) + rng -.. ipython:: python +``date_range`` and ``bdate_range`` make it easy to generate a range of dates +using various combinations of their parameters like ``start``, ``end``, +``periods``, and ``freq``: + + date_range(start, end, freq=BMonthEnd()) - date_range(start, periods=20) - date_range(end=end, periods=20) + date_range(start, end, freq=3 * Week()) + + bdate_range(end=end, periods=20) + + bdate_range(start=start, periods=20) The start and end dates are strictly inclusive. So it will not generate any dates outside of those dates if specified. -date_range is a valid Index -~~~~~~~~~~~~~~~~~~~~~~~~~~ -One of the main uses for ``date_range`` is as an index for pandas objects. When -working with a lot of time series data, there are several reasons to use -``date_range`` objects when possible: +DatetimeIndex +~~~~~~~~~~~~~ + +One of the main uses for ``DatetimeIndex`` is as an index for pandas objects. 
+The ``DatetimeIndex`` class contains many timeseries related optimizations: - A large range of dates for various offsets are pre-computed and cached under the hood in order to make generating subsequent date ranges very fast (just have to grab a slice) - - Fast shifting using the ``shift`` method on pandas objects - - Unioning of overlapping date_range objects with the same frequency is very - fast (important for fast data alignment) + - Fast shifting using the ``shift`` and ``tshift`` method on pandas objects + - Unioning of overlapping DatetimeIndex objects with the same frequency is + very fast (important for fast data alignment) -The ``date_range`` is a valid index and can even be intelligent when doing -slicing, etc. +``DatetimeIndex`` can be used like a regular index and offers all of its +intelligent functionality like selection, slicing, etc. .. ipython:: python - rng = date_range(start, end, offset=BMonthEnd()) + rng = date_range(start, end, freq=BMonthEnd()) ts = Series(randn(len(rng)), index=rng) ts.index ts[:5].index ts[::2].index -More complicated fancy indexing will result in an ``Index`` that is no longer a -``date_range``, however: +However, complicated fancy indexing that breaks the DatetimeIndex's frequency +regularity will result in an ``Index`` that is no longer a ``DatetimeIndex``: .. ipython:: python @@ -335,12 +425,12 @@ and in Panel along the ``major_axis``. The shift method accepts an ``offset`` argument which can accept a ``DateOffset`` class or other ``timedelta``-like object or also a :ref:`time -rule `: +rule `: .. ipython:: python - ts.shift(5, offset=datetools.bday) - ts.shift(5, offset='EOM') + ts.shift(5, freq=datetools.bday) + ts.shift(5, freq='EOM') Frequency conversion ~~~~~~~~~~~~~~~~~~~~ @@ -351,7 +441,7 @@ generates a ``date_range`` and calls ``reindex``. .. ipython:: python - dr = date_range('1/1/2010', periods=3, offset=3 * datetools.bday) + dr = date_range('1/1/2010', periods=3, freq=3 * datetools.bday) ts = Series(randn(3), index=dr) ts ts.asfreq(BDay()) @@ -377,9 +467,9 @@ view) application of GroupBy. Carry out the following steps: .. code-block:: python - dr1hour = date_range(start, end, offset=Hour()) - dr5day = date_range(start, end, offset=5 * datetools.day) - dr10day = date_range(start, end, offset=10 * datetools.day) + dr1hour = date_range(start, end, freq=Hour()) + dr5day = date_range(start, end, freq=5 * datetools.day) + dr10day = date_range(start, end, freq=10 * datetools.day) 2. 
Use the ``asof`` function ("as of") of the date_range to do a groupby @@ -396,11 +486,11 @@ Here is a fully-worked example: # some minutely data minutely = date_range('1/3/2000 00:00:00', '1/3/2000 12:00:00', - offset=datetools.Minute()) + freq=datetools.Minute()) ts = Series(randn(len(minutely)), index=minutely) ts.index - hourly = date_range('1/3/2000', '1/4/2000', offset=datetools.Hour()) + hourly = date_range('1/3/2000', '1/4/2000', freq=datetools.Hour()) grouped = ts.groupby(hourly.asof) grouped.mean() diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index be969f3796935..6c035b816a9e9 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -241,5 +241,8 @@ Scatter plot matrix from pandas.tools.plotting import scatter_matrix df = DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd']) - @savefig scatter_matrix_ex.png width=6in - scatter_matrix(df, alpha=0.2, figsize=(8, 8)) + @savefig scatter_matrix_kde.png width=6in + scatter_matrix(df, alpha=0.2, figsize=(8, 8), diagonal='kde') + + @savefig scatter_matrix_hist.png width=6in + scatter_matrix(df, alpha=0.2, figsize=(8, 8), diagonal='hist') \ No newline at end of file diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 85c47d46beb74..b930bdbbde1b1 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -16,6 +16,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.8.0.txt + .. include:: whatsnew/v0.7.3.txt .. include:: whatsnew/v0.7.2.txt diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/whatsnew/v0.8.0.txt new file mode 100644 index 0000000000000..98f90d5254ed1 --- /dev/null +++ b/doc/source/whatsnew/v0.8.0.txt @@ -0,0 +1,4 @@ +.. _whatsnew_080: + +v.0.8.0 (TDB May, 2012) +----------------------- diff --git a/pandas/__init__.py b/pandas/__init__.py index 5451ee750d685..7ef0ba10c1aa0 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -34,6 +34,7 @@ from pandas.io.pytables import HDFStore from pandas.util.testing import debug -from pandas.tools.merge import merge, concat +from pandas.tools.merge import merge, concat, ordered_merge from pandas.tools.pivot import pivot_table, crosstab +from pandas.tools.plotting import scatter_matrix from pandas.tools.describe import value_range diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e7126fd489e9d..d46a199a2baea 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -8,29 +8,45 @@ import pandas.core.common as com import pandas._tseries as lib -def match(values, index): +def match(to_match, values, na_sentinel=-1): """ - + Compute locations of to_match into values Parameters ---------- + to_match : array-like + values to find positions of + values : array-like + Unique set of values + na_sentinel : int, default -1 + Value to mark "not found" + + Examples + -------- Returns ------- - match : ndarray + match : ndarray of integers """ - f = lambda htype, caster: _match_generic(values, index, htype, caster) - return _hashtable_algo(f, index.dtype) + values = np.asarray(values) + if issubclass(values.dtype.type, basestring): + values = np.array(values, dtype='O') + + f = lambda htype, caster: _match_generic(to_match, values, htype, caster) + return _hashtable_algo(f, values.dtype) def unique(values): """ + Compute unique values (not necessarily sorted) efficiently from input array + of values Parameters ---------- + values : array-like Returns ------- - + uniques """ f = lambda htype, caster: 
_unique_generic(values, htype, caster) return _hashtable_algo(f, values.dtype) @@ -78,6 +94,7 @@ def _unique_generic(values, table_type, type_caster): uniques = table.unique(values) return uniques + def factorize(values, sort=False, order=None, na_sentinel=-1): """ Encode input values as an enumerated type or categorical variable @@ -91,6 +108,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): Returns ------- """ + values = np.asarray(values) + is_datetime = com.is_datetime64_dtype(values) hash_klass, values = _get_data_algo(values, _hashtables) uniques = [] @@ -98,11 +117,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): labels, counts = table.get_labels(values, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) - + uniques = com._asarray_tuplesafe(uniques) if sort and len(counts) > 0: sorter = uniques.argsort() - reverse_indexer = np.empty(len(sorter), dtype=np.int32) + reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 @@ -112,6 +131,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): uniques = uniques.take(sorter) counts = counts.take(sorter) + if is_datetime: + uniques = np.array(uniques, dtype='M8[ns]') + return labels, uniques, counts def value_counts(values, sort=True, ascending=False): @@ -162,6 +184,9 @@ def _get_data_algo(values, func_map): if com.is_float_dtype(values): f = func_map['float64'] values = com._ensure_float64(values) + elif com.is_datetime64_dtype(values): + f = func_map['int64'] + values = values.view('i8') elif com.is_integer_dtype(values): f = func_map['int64'] values = com._ensure_int64(values) diff --git a/pandas/core/api.py b/pandas/core/api.py index 41721c483a5b3..6a986f4842f43 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -3,6 +3,8 @@ import numpy as np +from pandas.core.algorithms import factorize, match, unique + from pandas.core.common import isnull, notnull, save, load from pandas.core.factor import Factor from pandas.core.format import set_printoptions diff --git a/pandas/core/common.py b/pandas/core/common.py index cad17087a7622..f8418788b7c40 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -18,6 +18,7 @@ def next(x): from numpy.lib.format import read_array, write_array import numpy as np +import pandas._algos as _algos import pandas._tseries as lib from pandas.util import py3compat import codecs @@ -66,7 +67,7 @@ def isnull(obj): if isinstance(obj, Series): result = Series(result, index=obj.index, copy=False) - elif obj.dtype == np.datetime64: + elif obj.dtype == np.dtype('M8[ns]'): # this is the NaT pattern result = np.array(obj).view('i8') == lib.NaT else: @@ -96,6 +97,37 @@ def notnull(obj): return not res return -res +def mask_missing(arr, values_to_mask): + """ + Return a masking array of same size/shape as arr + with entries equaling any member of values_to_mask set to True + """ + if np.isscalar(values_to_mask): + values_to_mask = [values_to_mask] + + try: + values_to_mask = np.array(values_to_mask, dtype=arr.dtype) + except Exception: + values_to_mask = np.array(values_to_mask, dtype=object) + + na_mask = isnull(values_to_mask) + nonna = values_to_mask[-na_mask] + + mask = None + for x in nonna: + if mask is None: + mask = arr == x + else: + mask = mask | (arr == x) + + if na_mask.any(): + if mask is None: + mask = isnull(arr) + else: + mask = mask | isnull(arr) + + return mask + def _pickle_array(arr): arr = arr.view(np.ndarray) @@ -111,17 +143,17 @@ def 
_unpickle_array(bytes): def _take_1d_datetime(arr, indexer, out, fill_value=np.nan): view = arr.view(np.int64) outview = out.view(np.int64) - lib.take_1d_bool(view, indexer, outview, fill_value=fill_value) + _algos.take_1d_bool(view, indexer, outview, fill_value=fill_value) def _take_2d_axis0_datetime(arr, indexer, out, fill_value=np.nan): view = arr.view(np.int64) outview = out.view(np.int64) - lib.take_1d_bool(view, indexer, outview, fill_value=fill_value) + _algos.take_1d_bool(view, indexer, outview, fill_value=fill_value) def _take_2d_axis1_datetime(arr, indexer, out, fill_value=np.nan): view = arr.view(np.uint8) outview = out.view(np.uint8) - lib.take_1d_bool(view, indexer, outview, fill_value=fill_value) + _algos.take_1d_bool(view, indexer, outview, fill_value=fill_value) def _view_wrapper(f, wrap_dtype, na_override=None): def wrapper(arr, indexer, out, fill_value=np.nan): @@ -134,42 +166,42 @@ def wrapper(arr, indexer, out, fill_value=np.nan): _take1d_dict = { - 'float64' : lib.take_1d_float64, - 'int32' : lib.take_1d_int32, - 'int64' : lib.take_1d_int64, - 'object' : lib.take_1d_object, - 'bool' : _view_wrapper(lib.take_1d_bool, np.uint8), - 'datetime64[us]' : _view_wrapper(lib.take_1d_int64, np.int64, + 'float64' : _algos.take_1d_float64, + 'int32' : _algos.take_1d_int32, + 'int64' : _algos.take_1d_int64, + 'object' : _algos.take_1d_object, + 'bool' : _view_wrapper(_algos.take_1d_bool, np.uint8), + 'datetime64[ns]' : _view_wrapper(_algos.take_1d_int64, np.int64, na_override=lib.NaT), } _take2d_axis0_dict = { - 'float64' : lib.take_2d_axis0_float64, - 'int32' : lib.take_2d_axis0_int32, - 'int64' : lib.take_2d_axis0_int64, - 'object' : lib.take_2d_axis0_object, - 'bool' : _view_wrapper(lib.take_2d_axis0_bool, np.uint8), - 'datetime64[us]' : _view_wrapper(lib.take_2d_axis0_int64, np.int64, + 'float64' : _algos.take_2d_axis0_float64, + 'int32' : _algos.take_2d_axis0_int32, + 'int64' : _algos.take_2d_axis0_int64, + 'object' : _algos.take_2d_axis0_object, + 'bool' : _view_wrapper(_algos.take_2d_axis0_bool, np.uint8), + 'datetime64[ns]' : _view_wrapper(_algos.take_2d_axis0_int64, np.int64, na_override=lib.NaT), } _take2d_axis1_dict = { - 'float64' : lib.take_2d_axis1_float64, - 'int32' : lib.take_2d_axis1_int32, - 'int64' : lib.take_2d_axis1_int64, - 'object' : lib.take_2d_axis1_object, - 'bool' : _view_wrapper(lib.take_2d_axis1_bool, np.uint8), - 'datetime64[us]' : _view_wrapper(lib.take_2d_axis1_int64, np.int64, + 'float64' : _algos.take_2d_axis1_float64, + 'int32' : _algos.take_2d_axis1_int32, + 'int64' : _algos.take_2d_axis1_int64, + 'object' : _algos.take_2d_axis1_object, + 'bool' : _view_wrapper(_algos.take_2d_axis1_bool, np.uint8), + 'datetime64[ns]' : _view_wrapper(_algos.take_2d_axis1_int64, np.int64, na_override=lib.NaT), } _take2d_multi_dict = { - 'float64' : lib.take_2d_multi_float64, - 'int32' : lib.take_2d_multi_int32, - 'int64' : lib.take_2d_multi_int64, - 'object' : lib.take_2d_multi_object, - 'bool' : _view_wrapper(lib.take_2d_multi_bool, np.uint8), - 'datetime64[us]' : _view_wrapper(lib.take_2d_multi_int64, np.int64, + 'float64' : _algos.take_2d_multi_float64, + 'int32' : _algos.take_2d_multi_int32, + 'int64' : _algos.take_2d_multi_int64, + 'object' : _algos.take_2d_multi_object, + 'bool' : _view_wrapper(_algos.take_2d_multi_bool, np.uint8), + 'datetime64[ns]' : _view_wrapper(_algos.take_2d_multi_int64, np.int64, na_override=lib.NaT), } @@ -214,7 +246,7 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan): out.dtype) out = _maybe_upcast(out) np.putmask(out, mask, 
fill_value) - elif dtype_str in ('float64', 'object', 'datetime64[us]'): + elif dtype_str in ('float64', 'object', 'datetime64[ns]'): if out is None: out = np.empty(n, dtype=arr.dtype) take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) @@ -252,7 +284,7 @@ def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan): _ensure_int64(col_idx), out=out, fill_value=fill_value) return out - elif dtype_str in ('float64', 'object', 'datetime64[us]'): + elif dtype_str in ('float64', 'object', 'datetime64[ns]'): out = np.empty(out_shape, dtype=arr.dtype) take_f(arr, _ensure_int64(row_idx), _ensure_int64(col_idx), out=out, fill_value=fill_value) @@ -294,7 +326,7 @@ def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0, take_f = _get_take2d_function(dtype_str, axis=axis) take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) return out - elif dtype_str in ('float64', 'object', 'datetime64[us]'): + elif dtype_str in ('float64', 'object', 'datetime64[ns]'): if out is None: out = np.empty(out_shape, dtype=arr.dtype) take_f = _get_take2d_function(dtype_str, axis=axis) @@ -366,59 +398,73 @@ def wrapper(arr, mask, limit=None): f(view, mask, limit=limit) return wrapper -_pad_1d_datetime = _interp_wrapper(lib.pad_inplace_int64, np.int64) -_pad_2d_datetime = _interp_wrapper(lib.pad_2d_inplace_int64, np.int64) -_backfill_1d_datetime = _interp_wrapper(lib.backfill_inplace_int64, np.int64) -_backfill_2d_datetime = _interp_wrapper(lib.backfill_2d_inplace_int64, np.int64) +_pad_1d_datetime = _interp_wrapper(_algos.pad_inplace_int64, np.int64) +_pad_2d_datetime = _interp_wrapper(_algos.pad_2d_inplace_int64, np.int64) +_backfill_1d_datetime = _interp_wrapper(_algos.backfill_inplace_int64, np.int64) +_backfill_2d_datetime = _interp_wrapper(_algos.backfill_2d_inplace_int64, np.int64) -def pad_1d(values, limit=None): +def pad_1d(values, limit=None, mask=None): if is_float_dtype(values): - _method = lib.pad_inplace_float64 + _method = _algos.pad_inplace_float64 elif is_datetime64_dtype(values): _method = _pad_1d_datetime elif values.dtype == np.object_: - _method = lib.pad_inplace_object + _method = _algos.pad_inplace_object else: # pragma: no cover raise ValueError('Invalid dtype for padding') - _method(values, isnull(values).view(np.uint8), limit=limit) + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + _method(values, mask, limit=limit) -def backfill_1d(values, limit=None): +def backfill_1d(values, limit=None, mask=None): if is_float_dtype(values): - _method = lib.backfill_inplace_float64 + _method = _algos.backfill_inplace_float64 elif is_datetime64_dtype(values): _method = _backfill_1d_datetime elif values.dtype == np.object_: - _method = lib.backfill_inplace_object + _method = _algos.backfill_inplace_object else: # pragma: no cover raise ValueError('Invalid dtype for padding') - _method(values, isnull(values).view(np.uint8), limit=limit) + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + + _method(values, mask, limit=limit) -def pad_2d(values, limit=None): +def pad_2d(values, limit=None, mask=None): if is_float_dtype(values): - _method = lib.pad_2d_inplace_float64 + _method = _algos.pad_2d_inplace_float64 elif is_datetime64_dtype(values): _method = _pad_2d_datetime elif values.dtype == np.object_: - _method = lib.pad_2d_inplace_object + _method = _algos.pad_2d_inplace_object else: # pragma: no cover raise ValueError('Invalid dtype for padding') - _method(values, isnull(values).view(np.uint8), limit=limit) + if mask is 
None: + mask = isnull(values) + mask = mask.view(np.uint8) -def backfill_2d(values, limit=None): + _method(values, mask, limit=limit) + +def backfill_2d(values, limit=None, mask=None): if is_float_dtype(values): - _method = lib.backfill_2d_inplace_float64 + _method = _algos.backfill_2d_inplace_float64 elif is_datetime64_dtype(values): _method = _backfill_2d_datetime elif values.dtype == np.object_: - _method = lib.backfill_2d_inplace_object + _method = _algos.backfill_2d_inplace_object else: # pragma: no cover raise ValueError('Invalid dtype for padding') - _method(values, isnull(values).view(np.uint8), limit=limit) + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + _method(values, mask, limit=limit) def _consensus_name_attr(objs): name = objs[0].name @@ -710,36 +756,12 @@ def is_float_dtype(arr_or_dtype): return issubclass(tipo, np.floating) -def _ensure_float64(arr): - if arr.dtype != np.float64: - arr = arr.astype(np.float64) - return arr +_ensure_float64 = _algos.ensure_float64 +_ensure_int64 = _algos.ensure_int64 +_ensure_int32 = _algos.ensure_int32 +_ensure_platform_int = _algos.ensure_platform_int +_ensure_object = _algos.ensure_object -def _ensure_int64(arr): - try: - if arr.dtype != np.int64: - arr = arr.astype(np.int64) - return arr - except AttributeError: - return np.array(arr, dtype=np.int64) - -def _ensure_platform_int(labels): - try: - if labels.dtype != np.int_: # pragma: no cover - labels = labels.astype(np.int_) - return labels - except AttributeError: - return np.array(labels, dtype=np.int_) - -def _ensure_int32(arr): - if arr.dtype != np.int32: - arr = arr.astype(np.int32) - return arr - -def _ensure_object(arr): - if arr.dtype != np.object_: - arr = arr.astype('O') - return arr def _astype_nansafe(arr, dtype): if (np.issubdtype(arr.dtype, np.floating) and diff --git a/pandas/core/factor.py b/pandas/core/factor.py index 650ff033f79c9..6bc45924a08f2 100644 --- a/pandas/core/factor.py +++ b/pandas/core/factor.py @@ -18,11 +18,17 @@ class Factor(np.ndarray): * levels : ndarray """ def __new__(cls, data): - data = np.asarray(data, dtype=object) - levels, factor = unique_with_labels(data) - factor = factor.view(Factor) - factor.levels = levels - return factor + from pandas.core.index import _ensure_index + from pandas.core.algorithms import factorize + + try: + labels, levels, _ = factorize(data, sort=True) + except TypeError: + labels, levels, _ = factorize(data, sort=False) + + labels = labels.view(Factor) + labels.levels = _ensure_index(levels) + return labels levels = None @@ -51,21 +57,3 @@ def __getitem__(self, key): else: return np.ndarray.__getitem__(self, key) - -def unique_with_labels(values): - from pandas.core.index import Index - rizer = lib.Factorizer(len(values)) - labels, _ = rizer.factorize(values, sort=False) - uniques = Index(rizer.uniques) - labels = com._ensure_platform_int(labels) - try: - sorter = uniques.argsort() - reverse_indexer = np.empty(len(sorter), dtype=np.int_) - reverse_indexer.put(sorter, np.arange(len(sorter))) - labels = reverse_indexer.take(labels) - uniques = uniques.take(sorter) - except TypeError: - pass - - return uniques, labels - diff --git a/pandas/core/format.py b/pandas/core/format.py index 6ae204b944d3a..c22e2df221831 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -571,16 +571,30 @@ def get_result(self): if self.formatter: formatter = self.formatter else: - def formatter(x): - if isnull(x): - return 'NaT' - else: - return str(x) + formatter = _format_datetime64 fmt_values = 
[formatter(x) for x in self.values] - return _make_fixed_width(fmt_values, self.justify) +def _format_datetime64(x): + if isnull(x): + return 'NaT' + + stamp = lib.Timestamp(x) + base = stamp.strftime('%Y-%m-%d %H:%M:%S') + + fraction = stamp.microsecond * 1000 + stamp.nanosecond + digits = 9 + + if fraction == 0: + return base + + while (fraction % 10) == 0: + fraction /= 10 + digits -= 1 + + return base + ('.%%.%id' % digits) % fraction + def _make_fixed_width(strings, justify='right'): if len(strings) == 0: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9b9e0c62d4730..dc48baec85a00 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -32,7 +32,6 @@ from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.series import Series, _radd_compat from pandas.compat.scipy import scoreatpercentile as _quantile -from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex from pandas.util import py3compat from pandas.util.terminal import get_terminal_size @@ -153,7 +152,7 @@ 3 foo 4 3 bar 8 >>> merge(A, B, left_on='lkey', right_on='rkey', how='outer') - lkey value.x rkey value.y + lkey value_x rkey value_y 0 bar 2 bar 6 1 bar 2 bar 8 2 baz 3 NaN NaN @@ -305,7 +304,11 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, elif isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) datacopy = ma.copy(data) - datacopy[mask] = np.nan + if issubclass(data.dtype.type, np.datetime64): + datacopy[mask] = lib.NaT + else: + datacopy = com._maybe_upcast(datacopy) + datacopy[mask] = np.nan mgr = self._init_ndarray(datacopy, index, columns, dtype=dtype, copy=copy) elif isinstance(data, np.ndarray): @@ -527,6 +530,9 @@ def __iter__(self): """ return iter(self.columns) + def keys(self): + return self.columns + def iteritems(self): """Iterator over (column, series) pairs""" return ((k, self[k]) for k in self.columns) @@ -679,6 +685,93 @@ def to_dict(self): """ return dict((k, v.to_dict()) for k, v in self.iteritems()) + @classmethod + def from_json(cls, json, orient="columns", dtype=None, numpy=True): + """ + Convert JSON string to DataFrame + + Parameters + ---------- + json : The JSON string to parse. + orient : {'split', 'records', 'index', 'columns', 'values'}, + default 'columns' + The format of the JSON string + split : dict like + {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... , {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + dtype : dtype of the resulting DataFrame + numpy : direct decoding to numpy arrays. default True but falls back + to standard decoding if a problem occurs. 
+ + Returns + ------- + result : DataFrame + """ + from pandas._ujson import loads + df = None + + if numpy: + try: + if orient == "columns": + args = loads(json, dtype=dtype, numpy=True, labelled=True) + if args: + args = (args[0].T, args[2], args[1]) + df = DataFrame(*args) + elif orient == "split": + df = DataFrame(**loads(json, dtype=dtype, numpy=True)) + elif orient == "values": + df = DataFrame(loads(json, dtype=dtype, numpy=True)) + else: + df = DataFrame(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + except ValueError: + numpy = False + if not numpy: + if orient == "columns": + df = DataFrame(loads(json), dtype=dtype) + elif orient == "split": + df = DataFrame(dtype=dtype, **loads(json)) + elif orient == "index": + df = DataFrame(loads(json), dtype=dtype).T + else: + df = DataFrame(loads(json), dtype=dtype) + + return df + + def to_json(self, orient="columns", double_precision=10, + force_ascii=True): + """ + Convert DataFrame to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + orient : {'split', 'records', 'index', 'columns', 'values'}, + default 'columns' + The format of the JSON string + split : dict like + {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... , {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + double_precision : The number of decimal places to use when encoding + floating point values, default 10. + force_ascii : force encoded string to be ASCII, default True. + + Returns + ------- + result : JSON compatible string + """ + from pandas._ujson import dumps + return dumps(self, orient=orient, double_precision=double_precision, + ensure_ascii=force_ascii) + @classmethod def from_records(cls, data, index=None, exclude=None, columns=None, names=None, coerce_float=False): @@ -2338,7 +2431,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None): new_labels = labels[mask] return self.reindex(**{axis_name: new_labels}) - def drop_duplicates(self, cols=None, take_last=False): + def drop_duplicates(self, cols=None, take_last=False, inplace=False): """ Return DataFrame with duplicate rows removed, optionally only considering certain columns @@ -2350,13 +2443,25 @@ def drop_duplicates(self, cols=None, take_last=False): default use all of the columns take_last : boolean, default False Take the last observed row in a row. 
Defaults to the first row + skipna : boolean, default True + If True then keep NaN + inplace : boolean, default False + Whether to drop duplicates in place or to return a copy Returns ------- deduplicated : DataFrame """ + duplicated = self.duplicated(cols, take_last=take_last) - return self[-duplicated] + + if inplace: + inds, = (-duplicated).nonzero() + self._data = self._data.take(inds) + self._clear_item_cache() + return self + else: + return self[-duplicated] def duplicated(self, cols=None, take_last=False): """ @@ -2377,11 +2482,13 @@ def duplicated(self, cols=None, take_last=False): """ if cols is not None: if isinstance(cols, list): - keys = zip(*[self[x] for x in cols]) + values = [self[x].values for x in cols] + keys = lib.fast_zip_fillna(values) else: - keys = list(self[cols]) + keys = lib.fast_zip_fillna([self[cols]]) else: - keys = zip(*self.values.T) + values = list(self.values.T) + keys = lib.fast_zip_fillna(values) duplicated = lib.duplicated(keys, take_last=take_last) return Series(duplicated, index=self.index) @@ -2590,17 +2697,15 @@ def fillna(self, value=None, method='pad', axis=0, inplace=False, # Float type values if len(self.columns) == 0: return self - if np.isscalar(value): - new_data = self._data.fillna(value, inplace=inplace) - elif isinstance(value, dict): + if isinstance(value, dict): result = self if inplace else self.copy() for k, v in value.iteritems(): if k not in result: continue result[k].fillna(v, inplace=True) return result - else: # pragma: no cover - raise TypeError('Invalid fill value type: %s' % type(value)) + else: + new_data = self._data.fillna(value, inplace=inplace) if inplace: self._data = new_data @@ -2608,6 +2713,154 @@ def fillna(self, value=None, method='pad', axis=0, inplace=False, else: return self._constructor(new_data) + def replace(self, to_replace, value=None, method='pad', axis=0, + inplace=False, limit=None): + """ + Replace values given in 'to_replace' with 'value' or using 'method' + + Parameters + ---------- + value : scalar or dict, default None + Value to use to fill holes (e.g. 0), alternately a dict of values + specifying which value to use for each column (columns not in the + dict will not be filled) + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + axis : {0, 1}, default 0 + 0: fill column-by-column + 1: fill row-by-row + inplace : boolean, default False + If True, fill the DataFrame in place. Note: this will modify any + other views on this DataFrame, like if you took a no-copy slice of + an existing DataFrame, for example a column in a DataFrame. 
Returns + a reference to the filled object, which is self if inplace=True + limit : int, default None + Maximum size gap to forward or backward fill + + See also + -------- + reindex, asfreq + + Returns + ------- + filled : DataFrame + """ + self._consolidate_inplace() + + if value is None: + return self._interpolate(to_replace, method, axis, inplace, limit) + else: + # Float type values + if len(self.columns) == 0: + return self + + if np.isscalar(to_replace): + + if np.isscalar(value): # np.nan -> 0 + new_data = self._data.replace(to_replace, value, + inplace=inplace) + if inplace: + self._data = new_data + return self + else: + return self._constructor(new_data) + + elif isinstance(value, dict): # np.nan -> {'A' : 0, 'B' : -1} + return self._replace_dest_dict(to_replace, value, inplace) + + + elif isinstance(to_replace, dict): + + if np.isscalar(value): # {'A' : np.nan, 'B' : ''} -> 0 + return self._replace_src_dict(to_replace, value, inplace) + + elif isinstance(value, dict): # {'A' : np.nan} -> {'A' : 0} + return self._replace_both_dict(to_replace, value, inplace) + + raise ValueError('Fill value must be scalar or dict') + + + elif isinstance(to_replace, (list, np.ndarray)): + # [np.nan, ''] -> [0, 'missing'] + if isinstance(value, (list, np.ndarray)): + if len(to_replace) != len(value): + raise ValueError('Replacement lists must match ' + 'in length. Expecting %d got %d ' % + (len(to_replace), len(value))) + + new_data = self._data if inplace else self.copy()._data + for s, d in zip(to_replace, value): + new_data = new_data.replace(s, d, inplace=True) + + else: # [np.nan, ''] -> 0 + new_data = self._data.replace(to_replace, value, + inplace=inplace) + + if inplace: + self._data = new_data + return self + else: + return self._constructor(new_data) + + raise ValueError('Invalid to_replace type: %s' % type(to_replace)) + + def _interpolate(self, to_replace, method, axis, inplace, limit): + if self._is_mixed_type and axis == 1: + return self.T.replace(to_replace, method=method, limit=limit).T + + method = com._clean_fill_method(method) + + if isinstance(to_replace, dict): + if axis == 1: + return self.T.replace(to_replace, method=method, + limit=limit).T + + rs = self if inplace else self.copy() + for k, v in to_replace.iteritems(): + if k in rs: + rs[k].replace(v, method=method, limit=limit, + inplace=True) + return rs + + else: + new_blocks = [] + for block in self._data.blocks: + newb = block.interpolate(method, axis=axis, + limit=limit, inplace=inplace, + missing=to_replace) + new_blocks.append(newb) + new_data = BlockManager(new_blocks, self._data.axes) + + if inplace: + self._data = new_data + return self + else: + return self._constructor(new_data) + + + def _replace_dest_dict(self, to_replace, value, inplace): + rs = self if inplace else self.copy() + for k, v in value.iteritems(): + if k in rs: + rs[k].replace(to_replace, v, inplace=True) + return rs + + def _replace_src_dict(self, to_replace, value, inplace): + rs = self if inplace else self.copy() + for k, src in to_replace.iteritems(): + if k in rs: + rs[k].replace(src, value, inplace=True) + return rs + + def _replace_both_dict(self, to_replace, value, inplace): + rs = self if inplace else self.copy() + for c, src in to_replace.iteritems(): + if c in value and c in rs: + rs[c].replace(src, value[c], inplace=True) + return rs + #---------------------------------------------------------------------- # Rename @@ -3423,7 +3676,7 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', @Appender(_merge_doc, 
indents=2) def merge(self, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=True, - suffixes=('.x', '.y'), copy=True): + suffixes=('_x', '_y'), copy=True): from pandas.tools.merge import merge return merge(self, right, how=how, on=on, left_on=left_on, right_on=right_on, @@ -4375,7 +4628,6 @@ def _homogenize(data, index, columns, dtype=None): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) - def _is_sequence(x): try: iter(x) @@ -4384,7 +4636,6 @@ def _is_sequence(x): except Exception: return False - def install_ipython_completers(): # pragma: no cover """Register the DataFrame type with IPython's tab completion machinery, so that it knows about accessing column names as attributes.""" diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5bd41423c9a2f..1ce05f852dedd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -155,9 +155,9 @@ def asfreq(self, freq, method=None, how=None): from pandas.tseries.resample import asfreq return asfreq(self, freq, method=method, how=how) - def resample(self, rule, how='mean', axis=0, - fill_method=None, closed='right', label='right', - convention=None, kind=None, loffset=None, limit=None): + def resample(self, rule, how='mean', axis=0, fill_method=None, + closed='right', label='right', convention=None, + kind=None, loffset=None, limit=None, base=0): """ Convenience method for frequency conversion and resampling of regular time-series data. @@ -175,12 +175,16 @@ def resample(self, rule, how='mean', axis=0, convention : {'start', 'end', 's', 'e'} loffset : timedelta Adjust the resampled time labels + base : int, default 0 + For frequencies that evenly subdivide 1 day, the "origin" of the + aggregated intervals. For example, for '5min' frequency, base could + range from 0 through 4. Defaults to 0 """ from pandas.tseries.resample import TimeGrouper sampler = TimeGrouper(rule, label=label, closed=closed, how=how, axis=axis, kind=kind, loffset=loffset, fill_method=fill_method, convention=convention, - limit=limit) + limit=limit, base=base) return sampler.resample(self) def first(self, offset): @@ -211,7 +215,7 @@ def first(self, offset): end_date = end = self.index[0] + offset # Tick-like, e.g. 
3 weeks - if not offset.isAnchored() and hasattr(offset, 'delta'): + if not offset.isAnchored() and hasattr(offset, '_inc'): if end_date in self.index: end = self.index.searchsorted(end_date, side='left') diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 4b4d7a8581f65..6d5ae2a573482 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -30,6 +30,20 @@ def f(self): return f +def _first_compat(x, axis=0): + x = np.asarray(x) + x = x[com.notnull(x)] + if len(x) == 0: + return np.nan + return x[0] + +def _last_compat(x, axis=0): + x = np.asarray(x) + x = x[com.notnull(x)] + if len(x) == 0: + return np.nan + return x[-1] + class GroupBy(object): """ @@ -99,9 +113,9 @@ class GroupBy(object): """ def __init__(self, obj, keys=None, axis=0, level=None, - grouper=None, exclusions=None, column=None, as_index=True, + grouper=None, exclusions=None, selection=None, as_index=True, sort=True, group_keys=True): - self._column = column + self._selection = selection if isinstance(obj, NDFrame): obj._consolidate_inplace() @@ -145,10 +159,16 @@ def indices(self): @property def name(self): - if self._column is None: + if self._selection is None: return None # 'result' else: - return self._column + return self._selection + + @property + def _selection_list(self): + if not isinstance(self._selection, (list, tuple, np.ndarray)): + return [self._selection] + return self._selection @property def _obj_with_exclusions(self): @@ -314,6 +334,8 @@ def size(self): prod = _groupby_function('prod', 'prod', np.prod) min = _groupby_function('min', 'min', np.min) max = _groupby_function('max', 'max', np.max) + first = _groupby_function('first', 'first', _first_compat) + last = _groupby_function('last', 'last', _last_compat) def ohlc(self): """ @@ -323,11 +345,11 @@ def ohlc(self): """ return self._cython_agg_general('ohlc') - def last(self): - return self.nth(-1) + # def last(self): + # return self.nth(-1) - def first(self): - return self.nth(0) + # def first(self): + # return self.nth(0) def nth(self, n): def picker(arr): @@ -520,10 +542,9 @@ def indices(self): if len(self.groupings) == 1: return self.groupings[0].indices else: - # TODO: this is massively inefficient - to_groupby = zip(*(ping.grouper for ping in self.groupings)) - to_groupby = Index(to_groupby) - return lib.groupby_indices(to_groupby) + label_list = [ping.labels for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] + return _get_indices_dict(label_list, keys) @property def labels(self): @@ -621,7 +642,9 @@ def get_group_levels(self): 'max' : lib.group_max, 'mean' : lib.group_mean, 'var' : lib.group_var, - 'std' : lib.group_var + 'std' : lib.group_var, + 'first': lambda a, b, c, d: lib.group_nth(a, b, c, d, 1), + 'last': lib.group_last } _cython_transforms = { @@ -858,7 +881,9 @@ def names(self): 'max' : lib.group_max_bin, 'var' : lib.group_var_bin, 'std' : lib.group_var_bin, - 'ohlc' : lib.group_ohlc + 'ohlc' : lib.group_ohlc, + 'first': lambda a, b, c, d: lib.group_nth_bin(a, b, c, d, 1), + 'last': lib.group_last_bin } _name_functions = { @@ -1182,15 +1207,23 @@ def aggregate(self, func_or_funcs, *args, **kwargs): return ret def _aggregate_multiple_funcs(self, arg): - if not isinstance(arg, dict): - arg = dict((func.__name__, func) for func in arg) + if isinstance(arg, dict): + columns = arg.keys() + arg = arg.items() + elif isinstance(arg[0], (tuple, list)): + # indicated column order + columns = list(zip(*arg))[0] + else: + # list of functions + columns = [func.__name__ for func in arg] + arg = 
zip(columns, arg) results = {} - for name, func in arg.iteritems(): + for name, func in arg: results[name] = self.aggregate(func) - return DataFrame(results) + return DataFrame(results, columns=columns) def _wrap_aggregated_output(self, output, names=None): # sort of a kludge @@ -1213,7 +1246,11 @@ def _get_index(): index = Index(keys, name=self.grouper.names[0]) return index - if isinstance(values[0], Series): + if isinstance(values[0], dict): + # # GH #823 + return DataFrame(values, index=keys).stack() + + if isinstance(values[0], (Series, dict)): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif isinstance(values[0], DataFrame): @@ -1268,10 +1305,10 @@ class NDFrameGroupBy(GroupBy): def _iterate_slices(self): if self.axis == 0: # kludge - if self._column is None: + if self._selection is None: slice_axis = self.obj.columns else: - slice_axis = [self._column] + slice_axis = self._selection_list slicer = lambda x: self.obj[x] else: slice_axis = self.obj.index @@ -1335,8 +1372,8 @@ def _post_process_cython_aggregate(self, obj): @cache_readonly def _obj_with_exclusions(self): - if self._column is not None: - return self.obj.reindex(columns=[self._column]) + if self._selection is not None: + return self.obj.reindex(columns=self._selection_list) if len(self.exclusions) > 0: return self.obj.drop(self.exclusions, axis=1) @@ -1368,19 +1405,38 @@ def aggregate(self, arg, *args, **kwargs): obj = self._obj_with_exclusions - if self._column is not None: - series_obj = obj[self._column] - for fname, func in arg.iteritems(): - colg = SeriesGroupBy(series_obj, column=self._column, + if any(isinstance(x, (list, tuple, dict)) for x in arg.values()): + new_arg = {} + for k, v in arg.iteritems(): + if not isinstance(v, (tuple, list, dict)): + new_arg[k] = [v] + else: + new_arg[k] = v + arg = new_arg + + keys = [] + if self._selection is not None: + subset = obj[self._selection] + if isinstance(subset, DataFrame): + raise NotImplementedError + + for fname, agg_how in arg.iteritems(): + colg = SeriesGroupBy(subset, selection=self._selection, grouper=self.grouper) - result[fname] = colg.aggregate(func) + result[fname] = colg.aggregate(agg_how) + keys.append(fname) else: - for col, func in arg.iteritems(): - colg = SeriesGroupBy(obj[col], column=col, + for col, agg_how in arg.iteritems(): + colg = SeriesGroupBy(obj[col], selection=col, grouper=self.grouper) - result[col] = colg.aggregate(func) + result[col] = colg.aggregate(agg_how) + keys.append(col) - result = DataFrame(result) + if isinstance(result.values()[0], DataFrame): + from pandas.tools.merge import concat + result = concat([result[k] for k in keys], keys=keys, axis=1) + else: + result = DataFrame(result) elif isinstance(arg, list): return self._aggregate_multiple_funcs(arg) else: @@ -1420,7 +1476,7 @@ def _aggregate_multiple_funcs(self, arg): keys = [] for col in obj: try: - colg = SeriesGroupBy(obj[col], column=col, + colg = SeriesGroupBy(obj[col], selection=col, grouper=self.grouper) results.append(colg.aggregate(arg)) keys.append(col) @@ -1467,7 +1523,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): cannot_agg = [] for item in obj: try: - colg = SeriesGroupBy(obj[item], column=item, + colg = SeriesGroupBy(obj[item], selection=item, grouper=self.grouper) result[item] = colg.aggregate(func, *args, **kwargs) except (ValueError, TypeError): @@ -1597,22 +1653,21 @@ class DataFrameGroupBy(NDFrameGroupBy): _block_agg_axis = 1 def __getitem__(self, key): - if self._column is not None: - raise Exception('Column 
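
# A rough sketch of the more flexible GroupBy aggregation enabled above, plus
# the new Cython-backed first/last reductions; data and names are illustrative.
import numpy as np
from pandas import DataFrame

df = DataFrame({'key': ['a', 'a', 'b', 'b'],
                'C': [1., 2., 3., 4.],
                'D': [10., 20., 30., 40.]})
grouped = df.groupby('key')

# list of (name, function) pairs: result columns come out in this exact order
grouped['C'].agg([('average', np.mean), ('spread', np.std)])

# dict with lists of functions per column: per-column results are concatenated
# with keys, giving hierarchical result columns
grouped.agg({'C': [np.sum, np.mean], 'D': [np.min, np.max]})

# new Cython-optimized reductions
grouped.first()
grouped.last()
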
%s already selected' % self._column) + if self._selection is not None: + raise Exception('Column(s) %s already selected' % self._selection) - if key not in self.obj: # pragma: no cover - raise KeyError(str(key)) - - # kind of a kludge - if self.as_index: - return SeriesGroupBy(self.obj[key], column=key, - grouper=self.grouper, - exclusions=self.exclusions) - else: - return DataFrameGroupBy(self.obj, self.grouper, column=key, + if isinstance(key, (list, tuple)) or not self.as_index: + return DataFrameGroupBy(self.obj, self.grouper, selection=key, grouper=self.grouper, exclusions=self.exclusions, as_index=self.as_index) + else: + if key not in self.obj: # pragma: no cover + raise KeyError(str(key)) + # kind of a kludge + return SeriesGroupBy(self.obj[key], selection=key, + grouper=self.grouper, + exclusions=self.exclusions) def _wrap_generic_output(self, result, obj): result_index = self.grouper.levels[0] @@ -1710,14 +1765,15 @@ class PanelGroupBy(NDFrameGroupBy): def _iterate_slices(self): if self.axis == 0: # kludge - if self._column is None: + if self._selection is None: slice_axis = self.obj.items else: - slice_axis = [self._column] + slice_axis = self._selection_list slicer = lambda x: self.obj[x] - elif foo: - slice_axis = self.obj.index - slicer = lambda x: self.obj.xs(x, axis=self.axis) + else: + raise NotImplementedError + # slice_axis = self.obj.index + # slicer = lambda x: self.obj.xs(x, axis=self.axis) for val in slice_axis: if val in self.exclusions: @@ -1948,6 +2004,21 @@ def get_key(self, comp_id): return tuple(level[table.get_item(comp_id)] for table, level in zip(self.tables, self.levels)) + +def _get_indices_dict(label_list, keys): + shape = [len(x) for x in keys] + group_index = get_group_index(label_list, shape) + + sorter, _ = lib.groupsort_indexer(com._ensure_int64(group_index), + np.prod(shape)) + + sorter_int = com._ensure_platform_int(sorter) + + sorted_labels = [lab.take(sorter_int) for lab in label_list] + group_index = group_index.take(sorter_int) + + return lib.indices_fast(sorter, group_index, keys, sorted_labels) + #---------------------------------------------------------------------- # sorting levels...cleverly? 
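
# With the list handling in __getitem__ above, a subset of the value columns
# can be selected before aggregating; a small illustrative example.
from pandas import DataFrame

df = DataFrame({'key': ['a', 'a', 'b'],
                'C': [1., 2., 3.],
                'D': [4., 5., 6.],
                'E': [7., 8., 9.]})

# aggregate only columns C and D, leaving E out of the result
df.groupby('key')[['C', 'D']].mean()
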
@@ -2015,9 +2086,7 @@ def _intercept_cython(func): return _cython_table.get(func) def _groupby_indices(values): - if values.dtype != np.object_: - values = values.astype('O') - return lib.groupby_indices(values) + return lib.groupby_indices(com._ensure_object(values)) def numpy_groupby(data, labels, axis=0): s = np.argsort(labels) diff --git a/pandas/core/index.py b/pandas/core/index.py index d0b9ef4fbde13..0b10fbbbd9a89 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -12,6 +12,7 @@ from pandas.util import py3compat import pandas.core.common as com import pandas._tseries as lib +import pandas._algos as _algos __all__ = ['Index'] @@ -56,11 +57,11 @@ class Index(np.ndarray): _join_precedence = 1 # Cython methods - _groupby = lib.groupby_object - _arrmap = lib.arrmap_object - _left_indexer = lib.left_join_indexer_object - _inner_indexer = lib.inner_join_indexer_object - _outer_indexer = lib.outer_join_indexer_object + _groupby = _algos.groupby_object + _arrmap = _algos.arrmap_object + _left_indexer = _algos.left_join_indexer_object + _inner_indexer = _algos.inner_join_indexer_object + _outer_indexer = _algos.outer_join_indexer_object _box_scalars = False @@ -690,8 +691,8 @@ def get_indexer(self, target, method=None, limit=None): return pself.get_indexer(ptarget, method=method, limit=limit) if self.dtype != target.dtype: - this = Index(self, dtype=object) - target = Index(target, dtype=object) + this = self.astype(object) + target = target.astype(object) return this.get_indexer(target, method=method, limit=limit) if not self.is_unique: @@ -1067,11 +1068,11 @@ def copy(self, order='C'): class Int64Index(Index): - _groupby = lib.groupby_int64 - _arrmap = lib.arrmap_int64 - _left_indexer = lib.left_join_indexer_int64 - _inner_indexer = lib.inner_join_indexer_int64 - _outer_indexer = lib.outer_join_indexer_int64 + _groupby = _algos.groupby_int64 + _arrmap = _algos.arrmap_int64 + _left_indexer = _algos.left_join_indexer_int64 + _inner_indexer = _algos.inner_join_indexer_int64 + _outer_indexer = _algos.outer_join_indexer_int64 _engine_type = lib.Int64Engine @@ -1171,8 +1172,12 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None): levels = [_ensure_index(lev) for lev in levels] labels = [np.asarray(labs, dtype=np.int_) for labs in labels] - values = [ndtake(np.asarray(lev), lab) + values = [ndtake(lev.values, lab) for lev, lab in zip(levels, labels)] + + # Need to box timestamps, etc. 
+ values = _clean_arrays(values) + subarr = lib.fast_zip(values).view(cls) subarr.levels = levels @@ -1378,7 +1383,7 @@ def lexsort_depth(self): return self.nlevels else: return 0 - + int64_labels = [com._ensure_int64(lab) for lab in self.labels] for k in range(self.nlevels, 0, -1): if lib.is_lexsorted(int64_labels[:k]): @@ -2371,3 +2376,13 @@ def _maybe_box_dtindex(idx): return Index(_dt_box_array(idx.asi8), dtype='object') return idx +def _clean_arrays(values): + result = [] + for arr in values: + if np.issubdtype(arr.dtype, np.datetime_): + result.append(lib.map_infer(arr, lib.Timestamp)) + else: + result.append(arr) + return result + + diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 5aa46e41d4c71..c2fb8d820bf8e 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -171,7 +171,7 @@ def _getitem_lowerdim(self, tup): except Exception: if isinstance(tup[0], slice): raise IndexingError - if tup[0] not in ax0: + if tup[0] not in ax0: # and tup[0] not in ax0.levels[0]: raise # to avoid wasted computation diff --git a/pandas/core/internals.py b/pandas/core/internals.py index fde2ac81e56de..c4e4d810f4e0c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -209,15 +209,51 @@ def split_block_at(self, item): def fillna(self, value, inplace=False): new_values = self.values if inplace else self.values.copy() - mask = com.isnull(new_values.ravel()) - new_values.flat[mask] = value + + mask = com.isnull(new_values) + np.putmask(new_values, mask, value) + + if inplace: + return self + else: + return make_block(new_values, self.items, self.ref_items) + + def _can_hold_element(self, value): + raise NotImplementedError() + + def _try_cast(self, value): + raise NotImplementedError() + + def replace(self, to_replace, value, inplace=False): + new_values = self.values if inplace else self.values.copy() + if self._can_hold_element(value): + value = self._try_cast(value) + + if np.isscalar(to_replace): + if self._can_hold_element(to_replace): + to_replace = self._try_cast(to_replace) + np.putmask(new_values, com.mask_missing(new_values, to_replace), + value) + else: + try: + to_replace = np.array(to_replace, dtype=self.dtype) + np.putmask(new_values, com.mask_missing(new_values, to_replace), + value) + except: + to_replace = np.array(to_replace, dtype=object) + for r in to_replace: + if self._can_hold_element(r): + r = self._try_cast(r) + np.putmask(new_values, com.mask_missing(new_values, to_replace), + value) if inplace: return self else: return make_block(new_values, self.items, self.ref_items) - def interpolate(self, method='pad', axis=0, inplace=False, limit=None): + def interpolate(self, method='pad', axis=0, inplace=False, + limit=None, missing=None): values = self.values if inplace else self.values.copy() if values.ndim != 2: @@ -225,10 +261,15 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=None): transf = (lambda x: x) if axis == 0 else (lambda x: x.T) + if missing is None: + mask = None + else: # todo create faster fill func without masking + mask = _mask_missing(transf(values), missing) + if method == 'pad': - com.pad_2d(transf(values), limit=limit) + com.pad_2d(transf(values), limit=limit, mask=mask) else: - com.backfill_2d(transf(values), limit=limit) + com.backfill_2d(transf(values), limit=limit, mask=mask) return make_block(values, self.items, self.ref_items) @@ -239,35 +280,91 @@ def take(self, indexer, axis=1, fill_value=np.nan): fill_value=fill_value) return make_block(new_values, self.items, self.ref_items) +def 
_mask_missing(array, missing_values): + if np.isscalar(missing_values): + missing_values = [missing_values] + + missing_values = np.array(missing_values, dtype=object) + if com.isnull(missing_values).any(): + mask = com.isnull(array) + missing_values = missing_values[com.notnull(missing_values)] + + for v in missing_values: + if mask is None: + mask = array == missing_values + else: + mask |= array == missing_values + return mask + #------------------------------------------------------------------------------- # Is this even possible? class FloatBlock(Block): _can_hold_na = True + def _can_hold_element(self, element): + return isinstance(element, (float, int)) + + def _try_cast(self, element): + try: + return float(element) + except: + return element + def should_store(self, value): # when inserting a column should not coerce integers to floats # unnecessarily return issubclass(value.dtype.type, np.floating) +class ComplexBlock(Block): + _can_hold_na = True + + def should_store(self, value): + return issubclass(value.dtype.type, np.complexfloating) + class IntBlock(Block): _can_hold_na = False + def _can_hold_element(self, element): + return isinstance(element, int) + + def _try_cast(self, element): + try: + return int(element) + except: + return element + def should_store(self, value): return issubclass(value.dtype.type, np.integer) class BoolBlock(Block): _can_hold_na = False + def _can_hold_element(self, element): + return isinstance(element, (int, bool)) + + def _try_cast(self, element): + try: + return bool(element) + except: + return element + def should_store(self, value): return issubclass(value.dtype.type, np.bool_) class ObjectBlock(Block): _can_hold_na = True + def _can_hold_element(self, element): + return True + + def _try_cast(self, element): + return element + def should_store(self, value): return not issubclass(value.dtype.type, - (np.integer, np.floating, np.bool_)) + (np.integer, np.floating, np.complexfloating, + np.bool_)) class DatetimeBlock(IntBlock): _can_hold_na = True @@ -279,6 +376,8 @@ def make_block(values, items, ref_items, do_integrity_check=False): if issubclass(vtype, np.floating): klass = FloatBlock + elif issubclass(vtype, np.complexfloating): + klass = ComplexBlock elif issubclass(vtype, np.datetime64): klass = DatetimeBlock elif issubclass(vtype, np.integer): @@ -423,7 +522,7 @@ def is_consolidated(self): def get_numeric_data(self, copy=False): num_blocks = [b for b in self.blocks - if isinstance(b, (IntBlock, FloatBlock))] + if isinstance(b, (IntBlock, FloatBlock, ComplexBlock))] indexer = np.sort(np.concatenate([b.ref_locs for b in num_blocks])) new_items = self.items.take(indexer) @@ -940,9 +1039,6 @@ def add_suffix(self, suffix): return self.rename_items(f) def fillna(self, value, inplace=False): - """ - - """ new_blocks = [b.fillna(value, inplace=inplace) if b._can_hold_na else b for b in self.blocks] @@ -950,6 +1046,14 @@ def fillna(self, value, inplace=False): return self return BlockManager(new_blocks, self.axes) + def replace(self, to_replace, value, inplace=False): + new_blocks = [b.replace(to_replace, value, inplace=inplace) + if b._can_hold_na else b + for b in self.blocks] + if inplace: + return self + return BlockManager(new_blocks, self.axes) + @property def block_id_vector(self): # TODO @@ -987,6 +1091,7 @@ def form_blocks(data, axes): # put "leftover" items in float bucket, where else? # generalize? 
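
# A quick illustration of the new complex128 support in the block manager;
# that this particular constructor path preserves the dtype is assumed here,
# not verified -- names and values are illustrative.
import numpy as np
from pandas import DataFrame

df = DataFrame({'a': np.array([1 + 2j, 3 + 4j]),
                'b': [1.5, 2.5]})

df['a'].dtype     # should be complex128 (its own ComplexBlock, not object)
df.values.dtype   # float + complex now interleave to complex128, not object
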
float_dict = {} + complex_dict = {} int_dict = {} bool_dict = {} object_dict = {} @@ -994,6 +1099,8 @@ def form_blocks(data, axes): for k, v in data.iteritems(): if issubclass(v.dtype.type, np.floating): float_dict[k] = v + elif issubclass(v.dtype.type, np.complexfloating): + complex_dict[k] = v elif issubclass(v.dtype.type, np.datetime64): datetime_dict[k] = v elif issubclass(v.dtype.type, np.integer): @@ -1008,12 +1115,17 @@ def form_blocks(data, axes): float_block = _simple_blockify(float_dict, items, np.float64) blocks.append(float_block) + if len(complex_dict): + complex_block = _simple_blockify(complex_dict, items, np.complex128) + blocks.append(complex_block) + if len(int_dict): int_block = _simple_blockify(int_dict, items, np.int64) blocks.append(int_block) if len(datetime_dict): - datetime_block = _simple_blockify(datetime_dict, items, np.datetime64) + datetime_block = _simple_blockify(datetime_dict, items, + np.dtype('M8[ns]')) blocks.append(datetime_block) if len(bool_dict): @@ -1095,8 +1207,9 @@ def _interleaved_dtype(blocks): have_bool = counts[BoolBlock] > 0 have_object = counts[ObjectBlock] > 0 have_float = counts[FloatBlock] > 0 + have_complex = counts[ComplexBlock] > 0 have_dt64 = counts[DatetimeBlock] > 0 - have_numeric = have_float or have_int + have_numeric = have_float or have_complex or have_int if have_object: return np.object_ @@ -1104,10 +1217,12 @@ def _interleaved_dtype(blocks): return np.object_ elif have_bool: return np.bool_ - elif have_int and not have_float: + elif have_int and not have_float and not have_complex: return np.int64 - elif have_dt64 and not have_float: + elif have_dt64 and not have_float and not have_complex: return np.datetime64 + elif have_complex: + return np.complex128 else: return np.float64 @@ -1115,7 +1230,7 @@ def _consolidate(blocks, items): """ Merge blocks having same dtype """ - get_dtype = lambda x: x.dtype + get_dtype = lambda x: x.dtype.name # sort by dtype grouper = itertools.groupby(sorted(blocks, key=get_dtype), diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 1237d7314af29..ad65a589cddfe 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -405,11 +405,10 @@ def unique1d(values): uniques = np.array(table.unique(com._ensure_int64(values)), dtype=np.int64) - if values.dtype == np.datetime64: - uniques = uniques.view('M8[us]') + if issubclass(values.dtype.type, np.datetime_): + uniques = uniques.view('M8[ns]') else: table = lib.PyObjectHashTable(len(values)) uniques = table.unique(com._ensure_object(values)) uniques = lib.list_to_object_array(uniques) return uniques - diff --git a/pandas/core/series.py b/pandas/core/series.py index c0de6aa21826d..03ec825034e2d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -609,6 +609,14 @@ def astype(self, dtype): casted = com._astype_nansafe(self.values, dtype) return self._constructor(casted, index=self.index, name=self.name) + def repeat(self, reps): + """ + See ndarray.repeat + """ + new_index = self.index.repeat(reps) + new_values = self.values.repeat(reps) + return Series(new_values, index=new_index, name=self.name) + def reshape(self, newshape, order='C'): """ See numpy.ndarray.reshape @@ -911,6 +919,77 @@ def to_dict(self): """ return dict(self.iteritems()) + @classmethod + def from_json(cls, json, orient="index", dtype=None, numpy=True): + """ + Convert JSON string to Series + + Parameters + ---------- + json : The JSON string to parse. 
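
# The new Series.repeat mirrors ndarray.repeat but also repeats the index
# labels; for example:
from pandas import Series

s = Series([1, 2, 3], index=['a', 'b', 'c'])

# values become [1, 1, 2, 2, 3, 3] with index ['a', 'a', 'b', 'b', 'c', 'c']
s.repeat(2)
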
+ orient : {'split', 'records', 'index'}, default 'index' + The format of the JSON string + split : dict like + {index -> [index], name -> name, data -> [values]} + records : list like [value, ... , value] + index : dict like {index -> value} + dtype : dtype of the resulting Series + nupmpy: direct decoding to numpy arrays. default True but falls back + to standard decoding if a problem occurs. + + Returns + ------- + result : Series + """ + from pandas._ujson import loads + s = None + + if numpy: + try: + if orient == "split": + s = Series(**loads(json, dtype=dtype, numpy=True)) + elif orient == "columns" or orient == "index": + s = Series(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + else: + s = Series(loads(json, dtype=dtype, numpy=True)) + except ValueError: + numpy = False + if not numpy: + if orient == "split": + s = Series(dtype=dtype, **loads(json)) + else: + s = Series(loads(json), dtype=dtype) + + return s + + def to_json(self, orient="index", double_precision=10, force_ascii=True): + """ + Convert Series to a JSON string + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + orient : {'split', 'records', 'index'}, default 'index' + The format of the JSON string + split : dict like + {index -> [index], name -> name, data -> [values]} + records : list like [value, ... , value] + index : dict like {index -> value} + double_precision : The number of decimal places to use when encoding + floating point values, default 10. + force_ascii : force encoded string to be ASCII, default True. + + Returns + ------- + result : JSON compatible string + """ + from pandas._ujson import dumps + return dumps(self, orient=orient, double_precision=double_precision, + ensure_ascii=force_ascii) + def to_sparse(self, kind='block', fill_value=None): """ Convert Series to SparseSeries @@ -2061,20 +2140,15 @@ def fillna(self, value=None, method='pad', inplace=False, ------- filled : Series """ - mask = isnull(self.values) - if value is not None: result = self.copy() if not inplace else self + mask = isnull(self.values) np.putmask(result, mask, value) else: if method is None: # pragma: no cover raise ValueError('must specify a fill method') - method = com._clean_fill_method(method) - if method == 'pad': - fill_f = com.pad_1d - elif method == 'backfill': - fill_f = com.backfill_1d + fill_f = _get_fill_func(method) if inplace: values = self.values @@ -2090,6 +2164,92 @@ def fillna(self, value=None, method='pad', inplace=False, return result + + def replace(self, to_replace, value=None, method='pad', inplace=False, + limit=None): + """ + Replace arbitrary values in a Series + + Parameters + ---------- + to_replace : list or dict + list of values to be replaced or dict of replacement values + value : anything + if to_replace is a list then value is the replacement value + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + inplace : boolean, default False + If True, fill the Series in place. Note: this will modify any other + views on this Series, for example a column in a DataFrame. 
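
# A round trip through the new JSON methods using the default 'index' orient;
# values are illustrative, and dict-based orients do not guarantee label order.
from pandas import Series

s = Series([1.5, 2.5, 3.5], index=['a', 'b', 'c'])

json_str = s.to_json()            # encodes a {label -> value} mapping
s2 = Series.from_json(json_str)   # reconstructs the Series from that mapping
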
Returns + a reference to the filled object, which is self if inplace=True + limit : int, default None + Maximum size gap to forward or backward fill + + Notes + ----- + replace does not distinguish between NaN and None + + See also + -------- + fillna, reindex, asfreq + + Returns + ------- + replaced : Series + """ + result = self.copy() if not inplace else self + + def _rep_one(s, to_rep, v): # replace single value + mask = com.mask_missing(s.values, to_rep) + np.putmask(s.values, mask, v) + return s + + def _rep_dict(rs, to_rep): # replace {[src] -> dest} + + dd = {} # group by unique destination value + [dd.setdefault(d, []).append(s) for s, d in to_rep.iteritems()] + + for d, sset in dd.iteritems(): # now replace by each dest + rs = _rep_one(rs, sset, d) + return rs + + if np.isscalar(to_replace): + to_replace = [to_replace] + + if isinstance(to_replace, dict): + return _rep_dict(result, to_replace) + + if isinstance(to_replace, (list, np.ndarray)): + + if isinstance(value, (list, np.ndarray)): # check same length + vl, rl = len(value), len(to_replace) + if vl == rl: + return _rep_dict(result, dict(zip(to_replace, value))) + raise ValueError('Got %d to replace but %d values' % (rl, vl)) + + elif value is not None: # otherwise all replaced with same value + + return _rep_one(result, to_replace, value) + + else: # method + if method is None: # pragma: no cover + raise ValueError('must specify a fill method') + fill_f = _get_fill_func(method) + + mask = com.mask_missing(result, to_replace) + fill_f(result.values, limit=limit, mask=mask) + + if not inplace: + result = Series(result.values, index=self.index, + name=self.name) + return result + + + raise ValueError('Unrecognized to_replace type %s' % + type(to_replace)) + def isin(self, values): """ Return boolean vector showing whether each element in the Series is @@ -2541,6 +2701,13 @@ def _resolve_offset(freq, kwds): return offset +def _get_fill_func(method): + method = com._clean_fill_method(method) + if method == 'pad': + fill_f = com.pad_1d + elif method == 'backfill': + fill_f = com.backfill_1d + return fill_f #---------------------------------------------------------------------- # Add plotting methods to Series diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e218fdce98380..aeb36963c69c8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -24,6 +24,9 @@ def next(x): from pandas.util.decorators import Appender +class DateConversionError(Exception): + pass + _parser_params = """Also supports optionally iterating or breaking of the file into chunks. @@ -46,8 +49,12 @@ def next(x): na_values : list-like or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values -parse_dates : boolean or list of column numbers/name, default False - Attempt to parse dates in the indicated columns +parse_dates : boolean, list of ints or names, list of lists, or dict + True -> try parsing all columns + [1, 2, 3] -> try parsing columns 1, 2, 3 + [[1, 3]] -> combine columns 1 and 3 and parse as date (for dates split + across multiple columns), and munge column names + {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo' date_parser : function Function to use for converting dates to strings. 
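
# Series.replace accepts a scalar, list, or dict of source values, matching
# the branches above; a short sketch with illustrative values.
import numpy as np
from pandas import Series

s = Series([0., 1., 2., 3., 4.])

s.replace(0., 5.)                     # single value -> single value
s.replace([0., 1., 2.], 10.)          # list of values -> one value
s.replace([1., 2.], [100., 200.])     # paired lists (must be equal length)
s.replace({0.: 10., 1.: 100.})        # dict of {source -> destination}
s.replace([0., 1.], method='pad')     # no value: fill matches with the method
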
Defaults to dateutil.parser @@ -155,7 +162,8 @@ def _read(cls, filepath_or_buffer, kwds): f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding) if kwds.get('date_parser', None) is not None: - kwds['parse_dates'] = True + if isinstance(kwds['parse_dates'], bool): + kwds['parse_dates'] = True # Extract some of the arguments (pass chunksize on). kwds.pop('filepath_or_buffer') @@ -362,8 +370,8 @@ class TextParser(object): def __init__(self, f, delimiter=None, names=None, header=0, index_col=None, na_values=None, thousands=None, comment=None, parse_dates=False, - date_parser=None, dayfirst=False, chunksize=None, - skiprows=None, skip_footer=0, converters=None, + date_parser=None, dayfirst=False, + chunksize=None, skiprows=None, skip_footer=0, converters=None, verbose=False, encoding=None): """ Workhorse function for processing nested list into DataFrame @@ -672,7 +680,6 @@ def get_chunk(self, rows=None): zipped_content = list(lib.to_object_array(content).T) - # no index column specified, so infer that's what is wanted if self.index_col is not None: if np.isscalar(self.index_col): index = zipped_content.pop(self.index_col) @@ -686,9 +693,8 @@ def get_chunk(self, rows=None): zipped_content.pop(i) if np.isscalar(self.index_col): - if self._should_parse_dates(0): - index = lib.try_parse_dates(index, parser=self.date_parser, - dayfirst=self.dayfirst) + if self._should_parse_dates(self.index_col): + index = self._conv_date(index) index, na_count = _convert_types(index, self.na_values) index = Index(index, name=self.index_name) if self.verbose and na_count: @@ -696,9 +702,8 @@ def get_chunk(self, rows=None): else: arrays = [] for i, arr in enumerate(index): - if self._should_parse_dates(i): - arr = lib.try_parse_dates(arr, parser=self.date_parser, - dayfirst=self.dayfirst) + if self._should_parse_dates(self.index_col[i]): + arr = self._conv_date(arr) arr, _ = _convert_types(arr, self.na_values) arrays.append(arr) index = MultiIndex.from_arrays(arrays, names=self.index_name) @@ -736,18 +741,13 @@ def get_chunk(self, rows=None): col = self.columns[col] data[col] = lib.map_infer(data[col], f) - if not isinstance(self.parse_dates, bool): - for x in self.parse_dates: - if isinstance(x, int) and x not in data: - x = self.orig_columns[x] - if x in self.index_col or x in self.index_name: - continue - data[x] = lib.try_parse_dates(data[x], parser=self.date_parser, - dayfirst=self.dayfirst) + columns = self.columns + if self.parse_dates is not None: + data, columns = self._process_date_conversion(data) data = _convert_to_ndarrays(data, self.na_values, self.verbose) - return DataFrame(data=data, columns=self.columns, index=index) + return DataFrame(data=data, columns=columns, index=index) def _find_line_number(self, exp_len, chunk_len, chunk_i): if exp_len is None: @@ -778,6 +778,68 @@ def _should_parse_dates(self, i): name = self.index_name[i] return i in to_parse or name in to_parse + def _conv_date(self, *date_cols): + if self.date_parser is None: + return lib.try_parse_dates(_concat_date_cols(date_cols), + dayfirst=self.dayfirst) + else: + try: + return self.date_parser(*date_cols) + except: + return lib.try_parse_dates(_concat_date_cols(date_cols), + parser=self.date_parser, + dayfirst=self.dayfirst) + + def _process_date_conversion(self, data_dict): + new_cols = [] + new_data = {} + columns = self.columns + + if self.parse_dates is None or isinstance(self.parse_dates, bool): + return data_dict, columns + + if isinstance(self.parse_dates, list): + # list of column lists + for colspec in 
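
# Putting the date-conversion plumbing above together: dates split across
# multiple columns can be combined at parse time. The file contents and
# column names below are illustrative.
from StringIO import StringIO
from pandas import read_csv

data = ('date,time,value\n'
        '19990127,19:00:00,0.81\n'
        '19990127,20:00:00,0.01\n')

# combine columns 0 and 1 into a single parsed column named 'date_time'
df = read_csv(StringIO(data), parse_dates=[[0, 1]])

# or give the combined, parsed column an explicit name
df = read_csv(StringIO(data), parse_dates={'timestamp': [0, 1]})
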
self.parse_dates: + if np.isscalar(colspec): + if isinstance(colspec, int) and colspec not in data_dict: + colspec = self.orig_columns[colspec] + if self._isindex(colspec): + continue + data_dict[colspec] = self._conv_date(data_dict[colspec]) + else: + new_name, col = _try_convert_dates(self._conv_date, colspec, + data_dict, self.orig_columns) + if new_name in data_dict: + raise ValueError('New date column already in dict %s' % + new_name) + new_data[new_name] = col + new_cols.append(new_name) + + elif isinstance(self.parse_dates, dict): + # dict of new name to column list + for new_name, colspec in self.parse_dates.iteritems(): + if new_name in data_dict: + raise ValueError('Date column %s already in dict' % + new_name) + + _, col = _try_convert_dates(self._conv_date, colspec, data_dict, + self.orig_columns) + new_data[new_name] = col + new_cols.append(new_name) + + data_dict.update(new_data) + new_cols.extend(columns) + return data_dict, new_cols + + def _isindex(self, colspec): + return (colspec == self.index_col or + (isinstance(self.index_col, list) and + colspec in self.index_col) or + (colspec == self.index_name or + (isinstance(self.index_name, list) and + colspec in self.index_name))) + def _get_lines(self, rows=None): source = self.data lines = self.buf @@ -860,6 +922,34 @@ def _convert_types(values, na_values): return result, na_count +def _get_col_names(colspec, columns): + colset = set(columns) + colnames = [] + for c in colspec: + if c in colset: + colnames.append(str(c)) + elif isinstance(c, int): + colnames.append(str(columns[c])) + return colnames + +def _try_convert_dates(parser, colspec, data_dict, columns): + colspec = _get_col_names(colspec, columns) + new_name = '_'.join(colspec) + + to_parse = [data_dict[c] for c in colspec if c in data_dict] + try: + new_col = parser(*to_parse) + except DateConversionError: + new_col = parser(_concat_date_cols(to_parse)) + return new_name, new_col + +def _concat_date_cols(date_cols): + if len(date_cols) == 1: + return date_cols[0] + + # stripped = [map(str.strip, x) for x in date_cols] + return np.array([' '.join(x) for x in zip(*date_cols)], dtype=object) + class FixedWidthReader(object): """ @@ -1001,7 +1091,7 @@ def _parse_xls(self, sheetname, header=0, skiprows=None, index_col=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None): from datetime import MINYEAR, time, datetime - from xlrd import xldate_as_tuple, XL_CELL_DATE + from xlrd import xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR datemode = self.book.datemode sheet = self.book.sheet_by_name(sheetname) @@ -1017,6 +1107,8 @@ def _parse_xls(self, sheetname, header=0, skiprows=None, index_col=None, value = time(*dt[3:]) else: value = datetime(*dt) + if typ == XL_CELL_ERROR: + value = np.nan row.append(value) data.append(row) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 65baa69d7c50c..b8724e854c7ba 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -579,12 +579,15 @@ def _write_index(self, group, key, index): node._v_attrs.kind = kind node._v_attrs.name = index.name - if isinstance(index, (DatetimeIndex, PeriodIndex, IntIndex)): - node._v_attrs.index_class = type(index) + if isinstance(index, (DatetimeIndex, PeriodIndex)): + node._v_attrs.index_class = _class_to_alias(type(index)) if hasattr(index, 'freq'): node._v_attrs.freq = index.freq + if hasattr(index, 'tz') and index.tz is not None: + node._v_attrs.tz = index.tz.zone + def _read_index(self, group, key): variety = getattr(group._v_attrs, '%s_variety' 
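
# With the xlrd change above, Excel error cells (e.g. #N/A) now come through
# as NaN; the workbook name here is a placeholder and xlrd must be installed.
from pandas.io.parsers import ExcelFile

xls = ExcelFile('data.xls')      # hypothetical workbook containing #N/A cells
parsed = xls.parse('Sheet1')     # error cells appear as NaN in the result
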
% key) @@ -667,16 +670,21 @@ def _read_index_node(self, node): if 'name' in node._v_attrs: name = node._v_attrs.name - index_class = getattr(node._v_attrs, 'index_class', Index) + index_class = _alias_to_class(getattr(node._v_attrs, 'index_class', '')) + factory = _get_index_factory(index_class) + kwargs = {} if 'freq' in node._v_attrs: kwargs['freq'] = node._v_attrs['freq'] + if 'tz' in node._v_attrs: + kwargs['tz'] = node._v_attrs['tz'] + if kind in ('date', 'datetime'): - index = index_class(_unconvert_index(data, kind), dtype=object, - **kwargs) + index = factory(_unconvert_index(data, kind), dtype=object, + **kwargs) else: - index = index_class(_unconvert_index(data, kind), **kwargs) + index = factory(_unconvert_index(data, kind), **kwargs) index.name = name @@ -831,8 +839,7 @@ def _read_panel_table(self, group, where=None): columns = _maybe_convert(sel.values['column'], table._v_attrs.columns_kind) - index = _maybe_convert(sel.values['index'], - table._v_attrs.index_kind) + index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind) values = sel.values['values'] major = Factor(index) @@ -842,7 +849,8 @@ def _read_panel_table(self, group, where=None): key = major.labels * K + minor.labels if len(unique(key)) == len(key): - sorter, _ = lib.groupsort_indexer(key, J * K) + sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K) + sorter = com._ensure_platform_int(sorter) # the data need to be sorted sorted_values = values.take(sorter, axis=0) @@ -871,6 +879,7 @@ def _read_panel_table(self, group, where=None): unique_tuples = _asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) + indexer = com._ensure_platform_int(indexer) new_index = long_index.take(indexer) new_values = lp.values.take(indexer, axis=0) @@ -953,7 +962,7 @@ def _read_array(group, key): def _unconvert_index(data, kind): if kind == 'datetime64': - index = np.array(data, dtype='M8[us]') + index = np.array(data, dtype='M8[ns]') elif kind == 'datetime': index = np.array([datetime.fromtimestamp(v) for v in data], dtype=object) @@ -985,7 +994,7 @@ def _maybe_convert(values, val_kind): def _get_converter(kind): if kind == 'datetime64': - return lambda x: np.datetime64(x) + return lambda x: np.array(x, dtype='M8[ns]') if kind == 'datetime': return lib.convert_timestamps else: # pragma: no cover @@ -1003,6 +1012,21 @@ def _is_table_type(group): # new node, e.g. return False +_index_type_map = {DatetimeIndex : 'datetime', + PeriodIndex : 'period'} + +_reverse_index_map = {} +for k, v in _index_type_map.iteritems(): + _reverse_index_map[v] = k + +def _class_to_alias(cls): + return _index_type_map.get(cls, '') + +def _alias_to_class(alias): + if isinstance(alias, type): + return alias + return _reverse_index_map.get(alias, Index) + class Selection(object): """ Carries out a selection operation on a tables.Table object. 
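
# The index-class alias map and tz attribute above let a timezone-aware
# DatetimeIndex survive an HDFStore round trip; requires PyTables and pytz,
# and the file path is illustrative.
import numpy as np
from pandas import DataFrame, date_range
from pandas.io.pytables import HDFStore

rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

store = HDFStore('store.h5')
store['frame'] = frame
recons = store['frame']          # index comes back with its tz intact
store.close()
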
@@ -1044,7 +1068,7 @@ def generate(self, where): field = c['field'] if field == 'index' and self.index_kind == 'datetime64': - val = np.datetime64(value).view('i8') + val = lib.Timestamp(value).value self.conditions.append('(%s %s %s)' % (field,op,val)) elif field == 'index' and isinstance(value, datetime): value = time.mktime(value.timetuple()) @@ -1085,3 +1109,11 @@ def select_coords(self): """ self.values = self.table.getWhereList(self.the_condition) +def _get_index_factory(klass): + if klass == DatetimeIndex: + def f(values, freq=None, tz=None): + return DatetimeIndex._simple_new(values, None, freq=freq, + tz=tz) + return f + return klass + diff --git a/pandas/io/tests/test3.xls b/pandas/io/tests/test3.xls new file mode 100644 index 0000000000000..f73943d677951 Binary files /dev/null and b/pandas/io/tests/test3.xls differ diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 02fc25329e7bc..f07e95cb2ffb3 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -12,6 +12,7 @@ import numpy as np from pandas import DataFrame, Index, isnull +import pandas.io.parsers as parsers from pandas.io.parsers import (read_csv, read_table, read_fwf, ExcelFile, TextParser) from pandas.util.testing import assert_almost_equal, assert_frame_equal, network @@ -90,6 +91,75 @@ def test_comment_fwf(self): comment='#') assert_almost_equal(df.values, expected) + def test_multiple_date_col(self): + # Can use multiple date parsers + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + def func(*date_cols): + return lib.try_parse_dates(parsers._concat_date_cols(date_cols)) + + df = read_csv(StringIO(data), header=None, + date_parser=func, + parse_dates={'nominal' : [1, 2], + 'actual' : [1,3]}) + self.assert_('nominal' in df) + self.assert_('actual' in df) + from datetime import datetime + d = datetime(1999, 1, 27, 19, 0) + self.assert_(df.ix[0, 'nominal'] == d) + + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + df = read_csv(StringIO(data), header=None, + parse_dates=[[1, 2], [1,3]]) + self.assert_('X.2_X.3' in df) + self.assert_('X.2_X.4' in df) + from datetime import datetime + d = datetime(1999, 1, 27, 19, 0) + self.assert_(df.ix[0, 'X.2_X.3'] == d) + + data = '''\ +KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +''' + df = read_csv(StringIO(data), sep=',', header=None, 
+ parse_dates=[1], index_col=1) + from datetime import datetime + d = datetime(1999, 1, 27, 19, 0) + self.assert_(df.index[0] == d) + + def test_multiple_date_cols_with_header(self): + data = """\ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + df = read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) + self.assert_(not isinstance(df.nominal[0], basestring)) + + def test_multiple_skts_example(self): + data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11." + pass + def test_malformed(self): # all data = """ignore @@ -306,7 +376,8 @@ def test_parse_dates_column_list(self): lev = expected.index.levels[0] expected.index.levels[0] = lev.to_datetime(dayfirst=True) expected['aux_date'] = to_datetime(expected['aux_date'], - dayfirst=True).astype('O') + dayfirst=True) + expected['aux_date'] = map(Timestamp, expected['aux_date']) self.assert_(isinstance(expected['aux_date'][0], datetime)) df = read_csv(StringIO(data), sep=";", index_col = range(4), @@ -377,6 +448,17 @@ def test_excel_stop_iterator(self): expected = DataFrame([['aaaa','bbbbb']], columns=['Test', 'Test1']) assert_frame_equal(parsed, expected) + def test_excel_cell_error_na(self): + try: + import xlrd + except ImportError: + raise nose.SkipTest('xlrd not installed, skipping') + + excel_data = ExcelFile(os.path.join(self.dirpath, 'test3.xls')) + parsed = excel_data.parse('Sheet1') + expected = DataFrame([[np.nan]], columns=['Test']) + assert_frame_equal(parsed, expected) + def test_excel_table(self): try: import xlrd diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 44dff4c4810b5..6cb97e3bcf082 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -8,7 +8,8 @@ from datetime import datetime import numpy as np -from pandas import Series, DataFrame, Panel, MultiIndex, bdate_range +from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, + date_range) from pandas.io.pytables import HDFStore, get_store import pandas.util.testing as tm from pandas.tests.test_series import assert_series_equal @@ -338,6 +339,19 @@ def test_can_serialize_dates(self): frame = DataFrame(np.random.randn(len(rng), 4), index=rng) self._check_roundtrip(frame, tm.assert_frame_equal) + def test_timezones(self): + rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + try: + store = HDFStore(self.scratchpath) + store['frame'] = frame + recons = store['frame'] + self.assert_(recons.index.equals(rng)) + self.assertEquals(rng.tz, recons.index.tz) + finally: + store.close() + os.remove(self.scratchpath) + def test_store_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py index afd1f57306b54..4d3620536f2cd 100644 --- a/pandas/rpy/common.py +++ b/pandas/rpy/common.py @@ -12,7 +12,8 @@ from rpy2.robjects import r import rpy2.robjects as robj -__all__ = ['convert_robj', 'load_data'] +__all__ = ['convert_robj', 'load_data', 
'convert_to_r_dataframe', + 'convert_to_r_matrix'] def load_data(name, package=None, convert=True): if package: @@ -173,6 +174,81 @@ def convert_robj(obj, use_pandas=True): raise Exception('Do not know what to do with %s object' % type(obj)) +VECTOR_TYPES = {np.float64: robj.FloatVector, + np.float32: robj.FloatVector, + np.float: robj.FloatVector, + np.int: robj.IntVector, + np.int32: robj.IntVector, + np.int64: robj.IntVector, + np.object_: robj.StrVector, + np.str: robj.StrVector} + +def convert_to_r_dataframe(df, strings_as_factors=False): + """ + Convert a pandas DataFrame to a R data.frame. + + Parameters + ---------- + df: The DataFrame being converted + strings_as_factors: Whether to turn strings into R factors (default: False) + + Returns + ------- + A R data.frame + + """ + + import rpy2.rlike.container as rlc + + columns = rlc.OrdDict() + + #FIXME: This doesn't handle MultiIndex + + for column in df: + value = df[column] + value_type = value.dtype.type + value = [item if pn.notnull(item) else robj.NA_Logical + for item in value] + value = VECTOR_TYPES[value_type](value) + + if not strings_as_factors: + I = robj.baseenv.get("I") + value = I(value) + + columns[column] = value + + r_dataframe = robj.DataFrame(columns) + + del columns + + r_dataframe.rownames = robj.StrVector(df.index) + + return r_dataframe + + +def convert_to_r_matrix(df, strings_as_factors=False): + + """ + Convert a pandas DataFrame to a R matrix. + + Parameters + ---------- + df: The DataFrame being converted + strings_as_factors: Whether to turn strings into R factors (default: False) + + Returns + ------- + A R matrix + + """ + + r_dataframe = convert_to_r_dataframe(df, strings_as_factors) + as_matrix = robj.baseenv.get("as.matrix") + r_matrix = as_matrix(r_dataframe) + + return r_matrix + + def test_convert_list(): obj = r('list(a=1, b=2, c=3)') @@ -213,6 +289,37 @@ def test_convert_matrix(): assert np.array_equal(converted.index, ['a', 'b', 'c']) assert np.array_equal(converted.columns, ['one', 'two', 'three']) +def test_convert_r_dataframe(): + + seriesd = _test.getSeriesData() + frame = pn.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) + + r_dataframe = convert_to_r_dataframe(frame) + + assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index) + assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns) + + for column in r_dataframe.colnames: + coldata = r_dataframe.rx2(column) + original_data = frame[column] + assert np.array_equal(convert_robj(coldata), original_data) + +def test_convert_r_matrix(): + + seriesd = _test.getSeriesData() + frame = pn.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) + + r_dataframe = convert_to_r_matrix(frame) + + assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index) + assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns) + + for column in r_dataframe.colnames: + coldata = r_dataframe.rx(True, column) + original_data = frame[column] + assert np.array_equal(convert_robj(coldata), original_data) + + if __name__ == '__main__': pass diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 9291d90765377..673d759de2f10 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -741,6 +741,23 @@ def apply(self, func, axis=0, broadcast=False): else: return self._apply_broadcast(func, axis) + def applymap(self, func): + """ + Apply a function to a DataFrame that is intended to operate + elementwise, i.e. 
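
# Example usage of the new rpy2 conversion helpers; requires rpy2, and the
# frame contents are illustrative.
import numpy as np
from pandas import DataFrame
import pandas.rpy.common as rcom

df = DataFrame({'x': np.arange(5.), 'y': list('abcde')})

# build an R data.frame; strings stay character vectors unless
# strings_as_factors=True
r_df = rcom.convert_to_r_dataframe(df)

# or coerce everything through R's as.matrix
r_mat = rcom.convert_to_r_matrix(df)
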
like doing map(func, series) for each series in the + DataFrame + + Parameters + ---------- + func : function + Python function, returns a single value from a single value + + Returns + ------- + applied : DataFrame + """ + return self.apply(lambda x: map(func, x)) + @Appender(DataFrame.fillna.__doc__) def fillna(self, value=None, method='pad', inplace=False, limit=None): new_series = {} diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 6bb6dd129c771..48d8bc0f77ca6 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -432,7 +432,6 @@ def test_operators_corner2(self): result = val - self.zbseries assert_sp_series_equal(result, 3 - self.zbseries) - def test_binary_operators(self): def _check_inplace_op(op): tmp = self.bseries.copy() diff --git a/pandas/src/datetime.pxd b/pandas/src/datetime.pxd index ae37c3cbadefa..e71139f0ab5ab 100644 --- a/pandas/src/datetime.pxd +++ b/pandas/src/datetime.pxd @@ -1,6 +1,13 @@ -from numpy cimport int64_t +from numpy cimport int64_t, int32_t, npy_int64, npy_int32 from cpython cimport PyObject + +cdef extern from "stdint.h": + enum: INT64_MIN + enum: INT32_MIN + + + cdef extern from "datetime.h": ctypedef class datetime.date [object PyDateTime_Date]: @@ -35,26 +42,6 @@ cdef extern from "numpy/ndarrayobject.h": ctypedef int64_t npy_timedelta ctypedef int64_t npy_datetime - ctypedef struct npy_datetimestruct: - int64_t year - int month, day, hour, min, sec, us, ps, as - - ctypedef enum NPY_DATETIMEUNIT: - #NPY_FR_Y - #NPY_FR_M - #NPY_FR_W - #NPY_FR_B - #NPY_FR_D - #NPY_FR_h - #NPY_FR_m - #NPY_FR_s - #NPY_FR_ms - NPY_FR_us - #NPY_FR_ns - #NPY_FR_ps - #NPY_FR_fs - #NPY_FR_as - ctypedef enum NPY_CASTING: NPY_NO_CASTING NPY_EQUIV_CASTING @@ -62,12 +49,6 @@ cdef extern from "numpy/ndarrayobject.h": NPY_SAME_KIND_CASTING NPY_UNSAFE_CASTING - npy_datetime PyArray_DatetimeStructToDatetime(NPY_DATETIMEUNIT fr, - npy_datetimestruct *d) - - void PyArray_DatetimeToDatetimeStruct(npy_datetime val, - NPY_DATETIMEUNIT fr, - npy_datetimestruct *result) cdef extern from "numpy_helper.h": npy_datetime unbox_datetime64_scalar(object o) @@ -78,9 +59,36 @@ cdef extern from "numpy/npy_common.h": cdef extern from "np_datetime.h": - int convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out, - NPY_DATETIMEUNIT *out_bestunit, + ctypedef enum PANDAS_DATETIMEUNIT: + PANDAS_FR_Y + PANDAS_FR_M + PANDAS_FR_W + PANDAS_FR_D + PANDAS_FR_B + PANDAS_FR_h + PANDAS_FR_m + PANDAS_FR_s + PANDAS_FR_ms + PANDAS_FR_us + PANDAS_FR_ns + PANDAS_FR_ps + PANDAS_FR_fs + PANDAS_FR_as + + ctypedef struct pandas_datetimestruct: + npy_int64 year + npy_int32 month, day, hour, min, sec, us, ps, as + + int convert_pydatetime_to_datetimestruct(PyObject *obj, + pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, int apply_tzinfo) + + npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *d) + void pandas_datetime_to_datetimestruct(npy_datetime val, + PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *result) int _days_per_month_table[2][12] int dayofweek(int y, int m, int d) @@ -88,18 +96,18 @@ cdef extern from "np_datetime.h": cdef extern from "np_datetime_strings.h": - int parse_iso_8601_datetime(char *str, int len, NPY_DATETIMEUNIT unit, - NPY_CASTING casting, npy_datetimestruct *out, - npy_bool *out_local, NPY_DATETIMEUNIT *out_bestunit, + int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, + NPY_CASTING casting, pandas_datetimestruct *out, 
+ npy_bool *out_local, PANDAS_DATETIMEUNIT *out_bestunit, npy_bool *out_special) - int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, - int local, NPY_DATETIMEUNIT base, int tzoffset, + int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, NPY_CASTING casting) - int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) + int get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) - # int parse_python_string(object obj, npy_datetimestruct *out) except -1 + # int parse_python_string(object obj, pandas_datetimestruct *out) except -1 cdef extern from "period.h": ctypedef struct date_info: @@ -128,36 +136,32 @@ cdef extern from "period.h": ctypedef int64_t (*freq_conv_func)(int64_t, char, asfreq_info*) - int64_t asfreq(int64_t dtordinal, int freq1, int freq2, char relation) except -1 - freq_conv_func get_asfreq_func(int fromFreq, int toFreq, int forConvert) + int64_t asfreq(int64_t dtordinal, int freq1, int freq2, char relation) except INT32_MIN + freq_conv_func get_asfreq_func(int fromFreq, int toFreq) void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) int64_t get_period_ordinal(int year, int month, int day, int hour, int minute, int second, - int freq) except -1 + int freq) except INT32_MIN - int64_t get_python_ordinal(int64_t period_ordinal, int freq) except -1 + int64_t get_python_ordinal(int64_t period_ordinal, int freq) except INT32_MIN char *skts_strftime(int64_t value, int freq, PyObject *args) char *period_to_string(int64_t value, int freq) char *period_to_string2(int64_t value, int freq, char *fmt) - int get_date_info(int64_t ordinal, int freq, date_info *dinfo) except -1 + int get_date_info(int64_t ordinal, int freq, date_info *dinfo) except INT32_MIN double getAbsTime(int, int64_t, int64_t) - int pyear(int64_t ordinal, int freq) except -1 - int pqyear(int64_t ordinal, int freq) except -1 - int pquarter(int64_t ordinal, int freq) except -1 - int pmonth(int64_t ordinal, int freq) except -1 - int pday(int64_t ordinal, int freq) except -1 - int pweekday(int64_t ordinal, int freq) except -1 - int pday_of_week(int64_t ordinal, int freq) except -1 - int pday_of_year(int64_t ordinal, int freq) except -1 - int pweek(int64_t ordinal, int freq) except -1 - int phour(int64_t ordinal, int freq) except -1 - int pminute(int64_t ordinal, int freq) except -1 - int psecond(int64_t ordinal, int freq) except -1 - -cdef extern from "stdint.h": - enum: INT64_MIN - + int pyear(int64_t ordinal, int freq) except INT32_MIN + int pqyear(int64_t ordinal, int freq) except INT32_MIN + int pquarter(int64_t ordinal, int freq) except INT32_MIN + int pmonth(int64_t ordinal, int freq) except INT32_MIN + int pday(int64_t ordinal, int freq) except INT32_MIN + int pweekday(int64_t ordinal, int freq) except INT32_MIN + int pday_of_week(int64_t ordinal, int freq) except INT32_MIN + int pday_of_year(int64_t ordinal, int freq) except INT32_MIN + int pweek(int64_t ordinal, int freq) except INT32_MIN + int phour(int64_t ordinal, int freq) except INT32_MIN + int pminute(int64_t ordinal, int freq) except INT32_MIN + int psecond(int64_t ordinal, int freq) except INT32_MIN diff --git a/pandas/src/datetime.pyx b/pandas/src/datetime.pyx index 42bc2a8fd71f0..93e1ced2d2e64 100644 --- a/pandas/src/datetime.pyx +++ b/pandas/src/datetime.pyx @@ -9,12 +9,16 @@ from cpython cimport * from datetime cimport * from util cimport is_integer_object, is_datetime64_object +from datetime import timedelta 
from dateutil.parser import parse as parse_date cimport util +from khash cimport * +import cython + # initialize numpy import_array() -import_ufunc() +#import_ufunc() # import datetime C API PyDateTime_IMPORT @@ -44,10 +48,9 @@ except NameError: # py3 # This serves as the box for datetime64 class Timestamp(_Timestamp): - __slots__ = ['value', 'offset'] - def __new__(cls, object ts_input, object offset=None, tz=None): cdef _TSObject ts + cdef _Timestamp ts_base if isinstance(ts_input, float): # to do, do we want to support this, ie with fractional seconds? @@ -69,6 +72,7 @@ class Timestamp(_Timestamp): # fill out rest of data ts_base.value = ts.value ts_base.offset = offset + ts_base.nanosecond = ts.dts.ps / 1000 return ts_base @@ -133,6 +137,11 @@ class Timestamp(_Timestamp): conv = tz.normalize(self) return Timestamp(conv) + def replace(self, **kwds): + return Timestamp(datetime.replace(self, **kwds), + offset=self.offset) + + cdef inline bint is_timestamp(object o): return isinstance(o, Timestamp) @@ -182,7 +191,7 @@ def apply_offset(ndarray[object] values, object offset): ndarray[int64_t] new_values object boxed - result = np.empty(n, dtype='M8[us]') + result = np.empty(n, dtype='M8[ns]') new_values = result.view('i8') pass @@ -191,10 +200,38 @@ def apply_offset(ndarray[object] values, object offset): # (see Timestamp class above). This will serve as a C extension type that # shadows the python class, where we do any heavy lifting. cdef class _Timestamp(datetime): - cdef: - int64_t value # numpy int64 + cdef readonly: + int64_t value, nanosecond object offset # frequency reference + def __richcmp__(_Timestamp self, object other, int op): + cdef _Timestamp ots + + if isinstance(other, _Timestamp): + ots = other + elif isinstance(other, datetime): + ots = Timestamp(other) + else: + if op == 2: + return False + elif op == 3: + return True + else: + raise TypeError('Cannot compare Timestamp with %s' % str(other)) + + if op == 2: # == + return self.value == ots.value + elif op == 3: # != + return self.value != ots.value + elif op == 0: # < + return self.value < ots.value + elif op == 1: # <= + return self.value <= ots.value + elif op == 4: # > + return self.value > ots.value + elif op == 5: # >= + return self.value >= ots.value + def __add__(self, other): if is_integer_object(other): if self.offset is None: @@ -204,7 +241,15 @@ cdef class _Timestamp(datetime): else: return Timestamp((self.offset.__mul__(other)).apply(self)) else: - return datetime.__add__(self, other) + if isinstance(other, timedelta) or hasattr(other, 'delta'): + nanos = _delta_to_nanoseconds(other) + return Timestamp(self.value + nanos) + else: + result = datetime.__add__(self, other) + if isinstance(result, datetime): + result = Timestamp(result) + result.nanosecond = self.nanosecond + return result def __sub__(self, other): if is_integer_object(other): @@ -217,10 +262,20 @@ cdef class _Timestamp(datetime): field) return out[0] +def _delta_to_nanoseconds(delta): + try: + delta = delta.delta + except: + pass + return (delta.days * 24 * 60 * 60 * 1000000 + + delta.seconds * 1000000 + + delta.microseconds) * 1000 + + # lightweight C object to hold datetime & int64 pair cdef class _TSObject: cdef: - npy_datetimestruct dts # npy_datetimestruct + pandas_datetimestruct dts # pandas_datetimestruct int64_t value # numpy dt64 object tzinfo @@ -247,13 +302,13 @@ cpdef convert_to_tsobject(object ts, object tz=None): if is_datetime64_object(ts): obj.value = unbox_datetime64_scalar(ts) - PyArray_DatetimeToDatetimeStruct(obj.value, 
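
# At the Python level, the Timestamp changes above add rich comparisons,
# timedelta arithmetic and a nanosecond field; the top-level import path is
# assumed from the 0.8 API.
from datetime import timedelta
from pandas import Timestamp

ts = Timestamp('2012-05-01 12:00:00')

ts == Timestamp('2012-05-01 12:00:00')   # compares against Timestamp/datetime
ts + timedelta(seconds=1)                # goes through _delta_to_nanoseconds
ts.nanosecond                            # new read-only sub-microsecond field
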
NPY_FR_us, &obj.dts) + pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) elif is_integer_object(ts): obj.value = ts - PyArray_DatetimeToDatetimeStruct(ts, NPY_FR_us, &obj.dts) + pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_string_object(ts): _string_to_dts(ts, &obj.dts) - obj.value = PyArray_DatetimeStructToDatetime(NPY_FR_us, &obj.dts) + obj.value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &obj.dts) elif PyDateTime_Check(ts): obj.value = _pydatetime_to_dts(ts, &obj.dts) obj.tzinfo = ts.tzinfo @@ -277,7 +332,7 @@ cpdef convert_to_tsobject(object ts, object tz=None): obj.value = obj.value + deltas[pos] if utc_convert: - PyArray_DatetimeToDatetimeStruct(obj.value, NPY_FR_us, + pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz._tzinfos[inf] @@ -293,16 +348,16 @@ cpdef convert_to_tsobject(object ts, object tz=None): # obj.dtval = _dts_to_pydatetime(&obj.dts) cdef inline object _datetime64_to_datetime(int64_t val): - cdef npy_datetimestruct dts - PyArray_DatetimeToDatetimeStruct(val, NPY_FR_us, &dts) + cdef pandas_datetimestruct dts + pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) return _dts_to_pydatetime(&dts) -cdef inline object _dts_to_pydatetime(npy_datetimestruct *dts): +cdef inline object _dts_to_pydatetime(pandas_datetimestruct *dts): return PyDateTime_FromDateAndTime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us) -cdef inline int64_t _pydatetime_to_dts(object val, npy_datetimestruct *dts): +cdef inline int64_t _pydatetime_to_dts(object val, pandas_datetimestruct *dts): dts.year = PyDateTime_GET_YEAR(val) dts.month = PyDateTime_GET_MONTH(val) dts.day = PyDateTime_GET_DAY(val) @@ -310,10 +365,11 @@ cdef inline int64_t _pydatetime_to_dts(object val, npy_datetimestruct *dts): dts.min = PyDateTime_DATE_GET_MINUTE(val) dts.sec = PyDateTime_DATE_GET_SECOND(val) dts.us = PyDateTime_DATE_GET_MICROSECOND(val) - return PyArray_DatetimeStructToDatetime(NPY_FR_us, dts) + dts.ps = dts.as = 0 + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) cdef inline int64_t _dtlike_to_datetime64(object val, - npy_datetimestruct *dts): + pandas_datetimestruct *dts): dts.year = val.year dts.month = val.month dts.day = val.day @@ -321,28 +377,27 @@ cdef inline int64_t _dtlike_to_datetime64(object val, dts.min = val.minute dts.sec = val.second dts.us = val.microsecond - return PyArray_DatetimeStructToDatetime(NPY_FR_us, dts) + dts.ps = dts.as = 0 + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) cdef inline int64_t _date_to_datetime64(object val, - npy_datetimestruct *dts): + pandas_datetimestruct *dts): dts.year = PyDateTime_GET_YEAR(val) dts.month = PyDateTime_GET_MONTH(val) dts.day = PyDateTime_GET_DAY(val) - dts.hour = 0 - dts.min = 0 - dts.sec = 0 - dts.us = 0 - return PyArray_DatetimeStructToDatetime(NPY_FR_us, dts) + dts.hour = dts.min = dts.sec = dts.us = 0 + dts.ps = dts.as = 0 + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) -cdef inline int _string_to_dts(object val, npy_datetimestruct* dts) except -1: +cdef inline int _string_to_dts(object val, pandas_datetimestruct* dts) except -1: cdef: npy_bool islocal, special - NPY_DATETIMEUNIT out_bestunit + PANDAS_DATETIMEUNIT out_bestunit if PyUnicode_Check(val): val = PyUnicode_AsASCIIString(val); - parse_iso_8601_datetime(val, len(val), NPY_FR_us, NPY_UNSAFE_CASTING, + parse_iso_8601_datetime(val, len(val), PANDAS_FR_ns, NPY_UNSAFE_CASTING, dts, &islocal, &out_bestunit, &special) return 0 @@ -734,19 
+789,18 @@ def string_to_datetime(ndarray[object] strings, raise_=False, dayfirst=False): from dateutil.parser import parse - try: - result = np.empty(n, dtype='M8[us]') + result = np.empty(n, dtype='M8[ns]') iresult = result.view('i8') for i in range(n): val = strings[i] if util._checknull(val): - result[i] = NaT + iresult[i] = NaT elif PyDateTime_Check(val): result[i] = val else: if len(val) == 0: - result[i] = NaT + iresult[i] = NaT continue try: result[i] = parse(val, dayfirst=dayfirst) @@ -762,7 +816,7 @@ def string_to_datetime(ndarray[object] strings, raise_=False, dayfirst=False): oresult[i] = val else: if len(val) == 0: - oresult[i] = NaT + oresult[i] = 'NaT' continue try: oresult[i] = parse(val, dayfirst=dayfirst) @@ -901,7 +955,7 @@ def _get_transitions(tz): Get UTC times of DST transitions """ if tz not in trans_cache: - arr = np.array(tz._utc_transition_times, dtype='M8[us]') + arr = np.array(tz._utc_transition_times, dtype='M8[ns]') trans_cache[tz] = arr.view('i8') return trans_cache[tz] @@ -926,7 +980,7 @@ cpdef ndarray _unbox_utcoffsets(object transinfo): arr = np.empty(sz, dtype='i8') for i in range(sz): - arr[i] = int(total_seconds(transinfo[i][0])) * 1000000 + arr[i] = int(total_seconds(transinfo[i][0])) * 1000000000 return arr @@ -983,7 +1037,7 @@ def build_field_sarray(ndarray[int64_t] dtindex): cdef: Py_ssize_t i, count = 0 int isleap - npy_datetimestruct dts + pandas_datetimestruct dts ndarray[int32_t] years, months, days, hours, minutes, seconds, mus count = len(dtindex) @@ -1007,7 +1061,7 @@ def build_field_sarray(ndarray[int64_t] dtindex): mus = out['u'] for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) years[i] = dts.year months[i] = dts.month days[i] = dts.day @@ -1030,7 +1084,7 @@ def fast_field_accessor(ndarray[int64_t] dtindex, object field): ndarray[int32_t] out ndarray[int32_t, ndim=2] _month_offset int isleap - npy_datetimestruct dts + pandas_datetimestruct dts _month_offset = np.array( [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ], @@ -1042,49 +1096,49 @@ def fast_field_accessor(ndarray[int64_t] dtindex, object field): if field == 'Y': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.year return out elif field == 'M': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.month return out elif field == 'D': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.day return out elif field == 'h': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.hour return out elif field == 'm': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.min return out elif field == 's': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.sec return out elif field == 'us': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + 
pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.us return out elif field == 'doy': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) out[i] = _month_offset[isleap, dts.month-1] + dts.day return out @@ -1097,7 +1151,7 @@ def fast_field_accessor(ndarray[int64_t] dtindex, object field): elif field == 'woy': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) out[i] = _month_offset[isleap, dts.month - 1] + dts.day out[i] = ((out[i] - 1) / 7) + 1 @@ -1105,7 +1159,7 @@ def fast_field_accessor(ndarray[int64_t] dtindex, object field): elif field == 'q': for i in range(count): - PyArray_DatetimeToDatetimeStruct(dtindex[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.month out[i] = ((out[i] - 1) / 3) + 1 return out @@ -1117,7 +1171,7 @@ cdef inline int m8_weekday(int64_t val): ts = convert_to_tsobject(val) return ts_dayofweek(ts) -cdef int64_t DAY_US = 86400000000LL +cdef int64_t DAY_NS = 86400000000000LL def values_at_time(ndarray[int64_t] stamps, int64_t time): cdef: @@ -1131,18 +1185,14 @@ def values_at_time(ndarray[int64_t] stamps, int64_t time): return np.empty(0, dtype=np.int64) # is this OK? - # days = stamps // DAY_US - times = stamps % DAY_US + # days = stamps // DAY_NS + times = stamps % DAY_NS - # Microsecond resolution + # Nanosecond resolution count = 0 for i in range(1, n): if times[i] == time: count += 1 - # cur = days[i] - # if cur > last: - # count += 1 - # last = cur indexer = np.empty(count, dtype=np.int64) @@ -1153,11 +1203,6 @@ def values_at_time(ndarray[int64_t] stamps, int64_t time): indexer[j] = i j += 1 - # cur = days[i] - # if cur > last: - # j += 1 - # last = cur - return indexer @@ -1165,25 +1210,25 @@ def date_normalize(ndarray[int64_t] stamps): cdef: Py_ssize_t i, n = len(stamps) ndarray[int64_t] result = np.empty(n, dtype=np.int64) - npy_datetimestruct dts + pandas_datetimestruct dts for i in range(n): - PyArray_DatetimeToDatetimeStruct(stamps[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) dts.hour = 0 dts.min = 0 dts.sec = 0 dts.us = 0 - result[i] = PyArray_DatetimeStructToDatetime(NPY_FR_us, &dts) + result[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) return result def dates_normalized(ndarray[int64_t] stamps): cdef: Py_ssize_t i, n = len(stamps) - npy_datetimestruct dts + pandas_datetimestruct dts for i in range(n): - PyArray_DatetimeToDatetimeStruct(stamps[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) if (dts.hour + dts.min + dts.sec + dts.us) > 0: return False @@ -1233,7 +1278,7 @@ cdef inline int64_t remove_mult(int64_t period_ord_w_mult, int64_t mult): return period_ord_w_mult * mult + 1; -def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, int64_t mult): +def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq): """ Convert array of datetime64 values (passed in as 'i8' dtype) to a set of periods corresponding to desired frequency, per period convention. 
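The docstring above covers the datetime64-to-period conversion. As a rough illustration of the period convention for monthly frequency (``Period('1970-01', 'M')`` has ordinal 0), here is a NumPy-only sketch; the real routine instead goes through ``pandas_datetime_to_datetimestruct`` and ``get_period_ordinal`` and supports every frequency::

    import numpy as np

    def dt64arr_to_month_ordinals(dtarr):
        # Toy analogue of dt64arr_to_periodarr, restricted to monthly frequency.
        # `dtarr` is an int64 array holding datetime64[ns] values; the result is
        # months elapsed since 1970-01 (the ordinal-0 period).
        dts = dtarr.view('M8[ns]')
        # Truncating to month resolution is exactly "months since the epoch".
        return dts.astype('M8[M]').astype(np.int64)

    stamps = np.array(['1970-01-15', '1971-03-01'], dtype='M8[ns]').view('i8')
    print(dt64arr_to_month_ordinals(stamps))   # [ 0 14]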
@@ -1241,20 +1286,19 @@ def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, int64_t mult): cdef: ndarray[int64_t] out Py_ssize_t i, l - npy_datetimestruct dts + pandas_datetimestruct dts l = len(dtarr) out = np.empty(l, dtype='i8') for i in range(l): - PyArray_DatetimeToDatetimeStruct(dtarr[i], NPY_FR_us, &dts) + pandas_datetime_to_datetimestruct(dtarr[i], PANDAS_FR_ns, &dts) out[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, freq) - out[i] = apply_mult(out[i], mult) + dts.hour, dts.min, dts.sec, freq) return out -def periodarr_to_dt64arr(ndarray[int64_t] periodarr, int freq, int64_t mult): +def periodarr_to_dt64arr(ndarray[int64_t] periodarr, int freq): """ Convert array to datetime64 values from a set of ordinals corresponding to periods per period convention. @@ -1268,15 +1312,15 @@ def periodarr_to_dt64arr(ndarray[int64_t] periodarr, int freq, int64_t mult): out = np.empty(l, dtype='i8') for i in range(l): - out[i] = period_ordinal_to_dt64(periodarr[i], freq, mult) + out[i] = period_ordinal_to_dt64(periodarr[i], freq) return out cdef char START = 'S' cdef char END = 'E' -cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int64_t mult1, - int freq2, int64_t mult2, bint end): +cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, + bint end): """ Convert period ordinal from one frequency to another, and if upsampling, choose to use start ('S') or end ('E') of period. @@ -1284,21 +1328,17 @@ cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int64_t mult1, cdef: int64_t retval - period_ordinal = remove_mult(period_ordinal, mult1) - - if mult1 != 1 and end: - period_ordinal += (mult1 - 1) - if end: retval = asfreq(period_ordinal, freq1, freq2, END) else: retval = asfreq(period_ordinal, freq1, freq2, START) - retval = apply_mult(retval, mult2) + + if retval == INT32_MIN: + raise ValueError('Frequency conversion failed') return retval -def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int64_t mult1, - int freq2, int64_t mult2, bint end): +def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): """ Convert int64-array of period ordinals from one frequency to another, and if upsampling, choose to use start ('S') or end ('E') of period. 
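At the Python level, the start/end relation described above surfaces as the ``how`` argument of ``Period.asfreq``; a brief usage illustration follows (the annual alias is spelled ``'A'`` in pandas of this vintage and ``'Y'`` in recent releases)::

    import pandas as pd

    # Upsampling an annual period to daily frequency: the relation picks the
    # first ('S') or last ('E') day that the coarser period covers.
    p = pd.Period('2011', freq='A')
    print(p.asfreq('D', how='S'))   # Period('2011-01-01', 'D')
    print(p.asfreq('D', how='E'))   # Period('2011-12-31', 'D')

    # Downsampling needs no relation: each day lies in exactly one year.
    print(pd.Period('2011-06-15', freq='D').asfreq('A'))   # Period('2011', 'A-DEC')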
@@ -1314,7 +1354,7 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int64_t mult1, n = len(arr) result = np.empty(n, dtype=np.int64) - func = get_asfreq_func(freq1, freq2, 0) + func = get_asfreq_func(freq1, freq2) get_asfreq_info(freq1, freq2, &finfo) if end: @@ -1323,32 +1363,25 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int64_t mult1, relation = START for i in range(n): - ordinal = remove_mult(arr[i], mult1) val = func(arr[i], relation, &finfo) if val == -1: raise ValueError("Unable to convert to desired frequency.") - result[i] = apply_mult(val, mult2) + result[i] = val return result -def period_ordinal(int y, int m, int d, int h, int min, int s, - int freq, int64_t mult): +def period_ordinal(int y, int m, int d, int h, int min, int s, int freq): cdef: int64_t ordinal - ordinal = get_period_ordinal(y, m, d, h, min, s, freq) + return get_period_ordinal(y, m, d, h, min, s, freq) - return apply_mult(ordinal, mult) -cpdef int64_t period_ordinal_to_dt64(int64_t period_ordinal, int freq, - int64_t mult): +cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq): cdef: - int64_t ordinal - npy_datetimestruct dts + pandas_datetimestruct dts date_info dinfo - ordinal = remove_mult(period_ordinal, mult) - get_date_info(ordinal, freq, &dinfo) dts.year = dinfo.year @@ -1357,131 +1390,81 @@ cpdef int64_t period_ordinal_to_dt64(int64_t period_ordinal, int freq, dts.hour = dinfo.hour dts.min = dinfo.minute dts.sec = int(dinfo.second) - dts.us = 0 + dts.us = dts.ps = 0 - return PyArray_DatetimeStructToDatetime(NPY_FR_us, &dts) + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) -def period_ordinal_to_string(int64_t value, int freq, int64_t mult): +def period_ordinal_to_string(int64_t value, int freq): cdef: char *ptr - ptr = period_to_string(remove_mult(value, mult), freq) + ptr = period_to_string(value, freq) if ptr == NULL: - raise ValueError("Could not create string from ordinal '%d'" % value) + raise ValueError("Could not create string from ordinal '%s'" % value) - return ptr + return ptr -def period_strftime(int64_t value, int freq, int64_t mult, object fmt): +def period_strftime(int64_t value, int freq, object fmt): cdef: char *ptr - value = remove_mult(value, mult) ptr = period_to_string2(value, freq, fmt) if ptr == NULL: raise ValueError("Could not create string with fmt '%s'" % fmt) - return ptr + return ptr # period accessors -ctypedef int (*accessor)(int64_t ordinal, int freq) except -1 - -cdef int apply_accessor(accessor func, int64_t value, int freq, - int64_t mult) except -1: - value = remove_mult(value, mult) - return func(value, freq) - -cpdef int get_period_year(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pyear, value, freq, mult) +ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN -cpdef int get_period_qyear(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pqyear, value, freq, mult) +def get_period_field(int code, int64_t value, int freq): + cdef accessor f = _get_accessor_func(code) + return f(value, freq) -cpdef int get_period_quarter(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pquarter, value, freq, mult) - -cpdef int get_period_month(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pmonth, value, freq, mult) - -cpdef int get_period_day(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pday, value, freq, mult) - -cpdef int get_period_hour(int64_t value, int freq, int64_t mult) except -1: 
- return apply_accessor(&phour, value, freq, mult) - -cpdef int get_period_minute(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pminute, value, freq, mult) - -cpdef int get_period_second(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&psecond, value, freq, mult) - -cpdef int get_period_dow(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pday_of_week, value, freq, mult) - -cpdef int get_period_week(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pweek, value, freq, mult) - -cpdef int get_period_weekday(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pweekday, value, freq, mult) - -cpdef int get_period_doy(int64_t value, int freq, int64_t mult) except -1: - return apply_accessor(&pday_of_year, value, freq, mult) - -# same but for arrays - -cdef ndarray[int64_t] apply_accessor_arr(accessor func, - ndarray[int64_t] arr, - int freq, int64_t mult): +def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): cdef: Py_ssize_t i, sz ndarray[int64_t] out + accessor f + + f = _get_accessor_func(code) sz = len(arr) out = np.empty(sz, dtype=np.int64) for i in range(sz): - out[i] = remove_mult(arr[i], mult) - out[i] = func(out[i], freq) + out[i] = f(arr[i], freq) return out -def get_period_year_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pyear, arr, freq, mult) - -def get_period_qyear_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pqyear, arr, freq, mult) - -def get_period_quarter_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pquarter, arr, freq, mult) - -def get_period_month_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pmonth, arr, freq, mult) - -def get_period_day_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pday, arr, freq, mult) - -def get_period_hour_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&phour, arr, freq, mult) -def get_period_minute_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pminute, arr, freq, mult) -def get_period_second_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&psecond, arr, freq, mult) - -def get_period_dow_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pday_of_week, arr, freq, mult) - -def get_period_week_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pweek, arr, freq, mult) - -def get_period_weekday_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pweekday, arr, freq, mult) - -def get_period_doy_arr(ndarray[int64_t] arr, int freq, int64_t mult): - return apply_accessor_arr(&pday_of_year, arr, freq, mult) +cdef accessor _get_accessor_func(int code): + if code == 0: + return &pyear + elif code == 1: + return &pqyear + elif code == 2: + return &pquarter + elif code == 3: + return &pmonth + elif code == 4: + return &pday + elif code == 5: + return &phour + elif code == 6: + return &pminute + elif code == 7: + return &psecond + elif code == 8: + return &pweek + elif code == 9: + return &pday_of_year + elif code == 10: + return &pweekday + else: + raise ValueError('Unrecognized code: %s' % code) -def get_abs_time(freq, dailyDate, originalDate): - return getAbsTime(freq, dailyDate, originalDate) diff --git a/pandas/src/engines.pyx b/pandas/src/engines.pyx index 
07a547de8da15..d6d20aabf9bc9 100644 --- a/pandas/src/engines.pyx +++ b/pandas/src/engines.pyx @@ -12,7 +12,7 @@ cimport util import numpy as np -# import _tseries +import _algos # include "hashtable.pyx" @@ -44,11 +44,17 @@ def get_value_at(ndarray arr, object loc): def set_value_at(ndarray arr, object loc, object val): return util.set_value_at(arr, loc, val) + +# Don't populate hash tables in monotonic indexes larger than this +cdef int _SIZE_CUTOFF = 1000000 + + cdef class IndexEngine: cdef readonly: object index_weakref HashTable mapping + bint over_size_threshold cdef: bint unique, monotonic @@ -56,6 +62,9 @@ cdef class IndexEngine: def __init__(self, index_weakref): self.index_weakref = index_weakref + + self.over_size_threshold = len(index_weakref()) >= _SIZE_CUTOFF + self.initialized = 0 self.monotonic_check = 0 @@ -79,6 +88,8 @@ cdef class IndexEngine: if PySlice_Check(loc) or cnp.PyArray_Check(loc): return arr[loc] else: + if arr.descr.type_num == NPY_DATETIME: + return Timestamp(util.get_value_at(arr, loc)) return util.get_value_at(arr, loc) cpdef set_value(self, ndarray arr, object key, object value): @@ -99,6 +110,15 @@ cdef class IndexEngine: if is_definitely_invalid_key(val): raise TypeError + if self.over_size_threshold and self.is_monotonic: + if not self.is_unique: + return self._get_loc_duplicates(val) + values = self._get_index_values() + loc = values.searchsorted(val, side='left') + if util.get_value_at(values, loc) != val: + raise KeyError(val) + return loc + self._ensure_mapping_populated() if not self.unique: return self._get_loc_duplicates(val) @@ -243,14 +263,14 @@ cdef class Int64Engine(IndexEngine): return Int64HashTable(n) def _call_monotonic(self, values): - return is_monotonic_int64(values) + return _algos.is_monotonic_int64(values) def get_pad_indexer(self, other, limit=None): - return pad_int64(self._get_index_values(), other, + return _algos.pad_int64(self._get_index_values(), other, limit=limit) def get_backfill_indexer(self, other, limit=None): - return backfill_int64(self._get_index_values(), other, + return _algos.backfill_int64(self._get_index_values(), other, limit=limit) cdef _get_bool_indexer(self, object val): @@ -292,26 +312,26 @@ cdef class Float64Engine(IndexEngine): return Float64HashTable(n) def _call_monotonic(self, values): - return is_monotonic_float64(values) + return _algos.is_monotonic_float64(values) def get_pad_indexer(self, other, limit=None): - return pad_float64(self._get_index_values(), other, + return _algos.pad_float64(self._get_index_values(), other, limit=limit) def get_backfill_indexer(self, other, limit=None): - return backfill_float64(self._get_index_values(), other, + return _algos.backfill_float64(self._get_index_values(), other, limit=limit) _pad_functions = { - 'object' : pad_object, - 'int64' : pad_int64, - 'float64' : pad_float64 + 'object' : _algos.pad_object, + 'int64' : _algos.pad_int64, + 'float64' : _algos.pad_float64 } _backfill_functions = { - 'object': backfill_object, - 'int64': backfill_int64, - 'float64': backfill_float64 + 'object': _algos.backfill_object, + 'int64': _algos.backfill_int64, + 'float64': _algos.backfill_float64 } cdef class ObjectEngine(IndexEngine): @@ -322,38 +342,36 @@ cdef class ObjectEngine(IndexEngine): return PyObjectHashTable(n) def _call_monotonic(self, values): - return is_monotonic_object(values) + return _algos.is_monotonic_object(values) def get_pad_indexer(self, other, limit=None): - return pad_object(self._get_index_values(), other, + return 
_algos.pad_object(self._get_index_values(), other, limit=limit) def get_backfill_indexer(self, other, limit=None): - return backfill_object(self._get_index_values(), other, + return _algos.backfill_object(self._get_index_values(), other, limit=limit) cdef class DatetimeEngine(Int64Engine): - # cdef Int64HashTable mapping - def __contains__(self, object val): - self._ensure_mapping_populated() - - if util.is_datetime64_object(val): - return val.view('i8') in self.mapping - - if PyDateTime_Check(val): - key = np.datetime64(val) - return key.view('i8') in self.mapping + if self.over_size_threshold and self.is_monotonic: + if not self.is_unique: + return self._get_loc_duplicates(val) + values = self._get_index_values() + conv = _to_i8(val) + loc = values.searchsorted(conv, side='left') + return util.get_value_at(values, loc) == conv - return val in self.mapping + self._ensure_mapping_populated() + return _to_i8(val) in self.mapping cdef _get_index_values(self): return self.index_weakref().values.view('i8') def _call_monotonic(self, values): - return is_monotonic_int64(values) + return _algos.is_monotonic_int64(values) cpdef get_loc(self, object val): if is_definitely_invalid_key(val): @@ -361,13 +379,19 @@ cdef class DatetimeEngine(Int64Engine): # Welcome to the spaghetti factory + if self.over_size_threshold and self.is_monotonic: + if not self.is_unique: + return self._get_loc_duplicates(val) + values = self._get_index_values() + conv = _to_i8(val) + loc = values.searchsorted(conv, side='left') + if util.get_value_at(values, loc) != conv: + raise KeyError(val) + return loc + self._ensure_mapping_populated() if not self.unique: - if util.is_datetime64_object(val): - val = val.view('i8') - elif PyDateTime_Check(val): - val = np.datetime64(val) - val = val.view('i8') + val = _to_i8(val) return self._get_loc_duplicates(val) try: @@ -378,11 +402,7 @@ cdef class DatetimeEngine(Int64Engine): pass try: - if util.is_datetime64_object(val): - val = val.view('i8') - elif PyDateTime_Check(val): - val = np.datetime64(val) - val = val.view('i8') + val = _to_i8(val) return self.mapping.get_item(val) except TypeError: self._date_check_type(val) @@ -395,25 +415,33 @@ cdef class DatetimeEngine(Int64Engine): def get_indexer(self, values): self._ensure_mapping_populated() - if values.dtype != 'M8': + if values.dtype != 'M8[ns]': return np.repeat(-1, len(values)).astype('i4') values = np.asarray(values).view('i8') return self.mapping.lookup(values) def get_pad_indexer(self, other, limit=None): - if other.dtype != 'M8': + if other.dtype != 'M8[ns]': return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') - return pad_int64(self._get_index_values(), other, - limit=limit) + return _algos.pad_int64(self._get_index_values(), other, + limit=limit) def get_backfill_indexer(self, other, limit=None): - if other.dtype != 'M8': + if other.dtype != 'M8[ns]': return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') - return backfill_int64(self._get_index_values(), other, - limit=limit) + return _algos.backfill_int64(self._get_index_values(), other, + limit=limit) + +cdef inline _to_i8(object val): + cdef pandas_datetimestruct dts + if util.is_datetime64_object(val): + val = unbox_datetime64_scalar(val) + elif PyDateTime_Check(val): + return _pydatetime_to_dts(val, &dts) + return val # ctypedef fused idxvalue_t: # object diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index ee151b6ebc8ef..77c4469958632 100644 --- a/pandas/src/generate_code.py +++ 
b/pandas/src/generate_code.py @@ -1,4 +1,65 @@ +import os from pandas.util.py3compat import StringIO +from pandas.src.codegen_template import template as pyx_template +from pandas.src.codegen_replace import replace + +header = """ +cimport numpy as np +cimport cython + +from numpy cimport * + +from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, + PyDict_Contains, PyDict_Keys, + Py_INCREF, PyTuple_SET_ITEM, + PyTuple_SetItem, + PyTuple_New) +from cpython cimport PyFloat_Check +cimport cpython + +import numpy as np +isnan = np.isnan +cdef double NaN = np.NaN +cdef double nan = NaN + +from datetime import datetime as pydatetime + +# this is our datetime.pxd +from datetime cimport * + +from khash cimport * + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + +ctypedef unsigned char UChar + +cimport util +from util cimport is_array, _checknull, _checknan + +cdef extern from "math.h": + double sqrt(double x) + double fabs(double) + +# import datetime C API +PyDateTime_IMPORT + +# initialize numpy +import_array() +import_ufunc() + +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num + +cpdef ensure_platform_int(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == PLATFORM_INT: + return arr + else: + return arr.astype(np.int_) + else: + return np.array(arr, dtype=np.int_) + +""" take_1d_template = """@cython.wraparound(False) @cython.boundscheck(False) @@ -540,6 +601,8 @@ def arrmap_%(name)s(ndarray[%(c_type)s] index, object func): cdef ndarray[object] result = np.empty(length, dtype=np.object_) + from _tseries import maybe_convert_objects + for i in range(length): result[i] = func(index[i]) @@ -762,6 +825,35 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, """ +# ensure_dtype functions + +ensure_dtype_template = """ +cpdef ensure_%(name)s(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_%(ctype)s: + return arr + else: + return arr.astype(np.%(dtype)s) + else: + return np.array(arr, dtype=np.%(dtype)s) + +""" + +ensure_functions = [ + ('float64', 'FLOAT64', 'float64'), + ('int32', 'INT32', 'int32'), + ('int64', 'INT64', 'int64'), + # ('platform_int', 'INT', 'int_'), + ('object', 'OBJECT', 'object_'), +] + +def generate_ensure_dtypes(): + output = StringIO() + for name, ctype, dtype in ensure_functions: + filled = ensure_dtype_template % locals() + output.write(filled) + return output.getvalue() + #---------------------------------------------------------------------- # Fast "put" logic for speeding up interleaving logic @@ -778,6 +870,10 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, out[i] = values[j, loc] """ + +#------------------------------------------------------------------------- +# Generators + def generate_put_functions(): function_list = [ ('float64', 'float64_t', 'object'), @@ -843,14 +939,10 @@ def generate_from_template(template, ndim=1, exclude=None): take_2d_axis1_template, take_2d_multi_template] - -# templates_1d_datetime = [take_1d_template] -# templates_2d_datetime = [take_2d_axis0_template, -# take_2d_axis1_template] - - def generate_take_cython_file(path='generated.pyx'): with open(path, 'w') as f: + print >> f, header + for template in templates_1d: print >> f, generate_from_template(template) @@ -866,7 +958,5 @@ def generate_take_cython_file(path='generated.pyx'): for template in nobool_1d_templates: print >> f, generate_from_template(template, exclude=['bool']) - # print >> 
f, generate_put_functions() - if __name__ == '__main__': generate_take_cython_file() diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index ed5f12791abf4..9a275c806c0eb 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -1,3 +1,60 @@ + +cimport numpy as np +cimport cython + +from numpy cimport * + +from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, + PyDict_Contains, PyDict_Keys, + Py_INCREF, PyTuple_SET_ITEM, + PyTuple_SetItem, + PyTuple_New) +from cpython cimport PyFloat_Check +cimport cpython + +import numpy as np +isnan = np.isnan +cdef double NaN = np.NaN +cdef double nan = NaN + +from datetime import datetime as pydatetime + +# this is our datetime.pxd +from datetime cimport * + +from khash cimport * + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + +ctypedef unsigned char UChar + +cimport util +from util cimport is_array, _checknull, _checknan + +cdef extern from "math.h": + double sqrt(double x) + double fabs(double) + +# import datetime C API +PyDateTime_IMPORT + +# initialize numpy +import_array() +import_ufunc() + +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num + +cpdef ensure_platform_int(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == PLATFORM_INT: + return arr + else: + return arr.astype(np.int_) + else: + return np.array(arr, dtype=np.int_) + + @cython.wraparound(False) @cython.boundscheck(False) cpdef map_indices_float64(ndarray[float64_t] index): @@ -1751,6 +1808,8 @@ def arrmap_float64(ndarray[float64_t] index, object func): cdef ndarray[object] result = np.empty(length, dtype=np.object_) + from _tseries import maybe_convert_objects + for i in range(length): result[i] = func(index[i]) @@ -1764,6 +1823,8 @@ def arrmap_object(ndarray[object] index, object func): cdef ndarray[object] result = np.empty(length, dtype=np.object_) + from _tseries import maybe_convert_objects + for i in range(length): result[i] = func(index[i]) @@ -1777,6 +1838,8 @@ def arrmap_int32(ndarray[int32_t] index, object func): cdef ndarray[object] result = np.empty(length, dtype=np.object_) + from _tseries import maybe_convert_objects + for i in range(length): result[i] = func(index[i]) @@ -1790,6 +1853,8 @@ def arrmap_int64(ndarray[int64_t] index, object func): cdef ndarray[object] result = np.empty(length, dtype=np.object_) + from _tseries import maybe_convert_objects + for i in range(length): result[i] = func(index[i]) @@ -1803,6 +1868,8 @@ def arrmap_bool(ndarray[uint8_t] index, object func): cdef ndarray[object] result = np.empty(length, dtype=np.object_) + from _tseries import maybe_convert_objects + for i in range(length): result[i] = func(index[i]) @@ -3251,3 +3318,44 @@ def inner_join_indexer_int64(ndarray[int64_t] left, return result, lindexer, rindexer + +cpdef ensure_float64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_FLOAT64: + return arr + else: + return arr.astype(np.float64) + else: + return np.array(arr, dtype=np.float64) + + +cpdef ensure_int32(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT32: + return arr + else: + return arr.astype(np.int32) + else: + return np.array(arr, dtype=np.int32) + + +cpdef ensure_int64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT64: + return arr + else: + return arr.astype(np.int64) + else: + return np.array(arr, dtype=np.int64) + + +cpdef ensure_object(object arr): + if util.is_array(arr): + 
if ( arr).descr.type_num == NPY_OBJECT: + return arr + else: + return arr.astype(np.object_) + else: + return np.array(arr, dtype=np.object_) + + diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx index 049f70b5f8237..78c3b0ff3f11a 100644 --- a/pandas/src/groupby.pyx +++ b/pandas/src/groupby.pyx @@ -330,6 +330,188 @@ def group_prod(ndarray[float64_t, ndim=2] out, else: out[i, j] = prodx[i, j] +#---------------------------------------------------------------------- +# first, nth, last + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_last(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_last_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( 
values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +#---------------------------------------------------------------------- +# group_min, group_max + @cython.boundscheck(False) @cython.wraparound(False) @@ -564,7 +746,6 @@ def group_var(ndarray[float64_t, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) - def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, object closed='left'): """ @@ -787,6 +968,7 @@ def group_min_bin(ndarray[float64_t, ndim=2] out, else: out[i, j] = minx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) def group_max_bin(ndarray[float64_t, ndim=2] out, @@ -924,8 +1106,8 @@ def group_ohlc(ndarray[float64_t, ndim=2] out, out[b, 3] = vclose -# @cython.boundscheck(False) -# @cython.wraparound(False) +@cython.boundscheck(False) +@cython.wraparound(False) def group_mean_bin(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -1085,62 +1267,6 @@ def lookup_values(ndarray[object] values, dict mapping): result[i] = mapping[values[i]] return maybe_convert_objects(result) -def reduce_mean(ndarray[object] indices, - ndarray[object] buckets, - ndarray[float64_t] values, - inclusive=False): - cdef: - Py_ssize_t i, j, nbuckets, nvalues - ndarray[float64_t] output - float64_t the_sum, val, nobs - - - - nbuckets = len(buckets) - nvalues = len(indices) - - assert(len(values) == len(indices)) - - output = np.empty(nbuckets, dtype=float) - output.fill(np.NaN) - - j = 0 - for i from 0 <= i < nbuckets: - next_bound = buckets[i] - the_sum = 0 - nobs = 0 - if inclusive: - while j < nvalues and indices[j] <= next_bound: - val = values[j] - # not NaN - if val == val: - the_sum += val - nobs += 1 - j += 1 - else: - while j < nvalues and indices[j] < next_bound: - val = values[j] - # not NaN - if val == val: - the_sum += val - nobs += 1 - j += 1 - - if nobs > 0: - output[i] = the_sum / nobs - - if j >= nvalues: - break - - return output - -def _bucket_locs(index, buckets, inclusive=False): - if inclusive: - locs = index.searchsorted(buckets, side='left') - else: - locs = index.searchsorted(buckets, side='right') - - return locs def count_level_1d(ndarray[uint8_t, cast=True] mask, ndarray[int64_t] labels, Py_ssize_t max_bin): @@ -1158,6 +1284,7 @@ def count_level_1d(ndarray[uint8_t, cast=True] mask, return counts + def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, ndarray[int64_t] labels, Py_ssize_t max_bin): cdef: @@ -1174,7 +1301,69 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts -def duplicated(list values, take_last=False): +cdef class _PandasNull: + + def __richcmp__(_PandasNull self, object other, int op): + if op == 2: # == + return isinstance(other, _PandasNull) + elif op == 3: # != + return not isinstance(other, _PandasNull) + else: + return False + + def __hash__(self): + return 0 + +pandas_null = _PandasNull() + +def fast_zip_fillna(list ndarrays, fill_value=pandas_null): + ''' + For zipping multiple ndarrays into an ndarray of tuples + ''' + cdef: + Py_ssize_t i, j, k, n + ndarray[object] result + flatiter it + object val, tup + + k = len(ndarrays) + n = len(ndarrays[0]) + + result = np.empty(n, dtype=object) + + # initialize tuples on first pass + arr = 
ndarrays[0] + it = PyArray_IterNew(arr) + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + tup = PyTuple_New(k) + + if val != val: + val = fill_value + + PyTuple_SET_ITEM(tup, 0, val) + Py_INCREF(val) + result[i] = tup + PyArray_ITER_NEXT(it) + + for j in range(1, k): + arr = ndarrays[j] + it = PyArray_IterNew(arr) + if len(arr) != n: + raise ValueError('all arrays must be same length') + + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + if val != val: + val = fill_value + + PyTuple_SET_ITEM(result[i], j, val) + Py_INCREF(val) + PyArray_ITER_NEXT(it) + + return result + +def duplicated(ndarray[object] values, take_last=False): cdef: Py_ssize_t i, n dict seen = {} @@ -1186,6 +1375,7 @@ def duplicated(list values, take_last=False): if take_last: for i from n > i >= 0: row = values[i] + if row in seen: result[i] = 1 else: @@ -1202,7 +1392,6 @@ def duplicated(list values, take_last=False): return result.view(np.bool_) - def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, lab, start @@ -1228,7 +1417,7 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): return starts, ends -def groupby_arrays(ndarray index, ndarray[int64_t] labels): +def groupby_arrays(ndarray index, ndarray[int64_t] labels, sort=True): cdef: Py_ssize_t i, lab, cur, start, n = len(index) dict result = {} @@ -1236,10 +1425,11 @@ def groupby_arrays(ndarray index, ndarray[int64_t] labels): index = np.asarray(index) # this is N log N. If this is a bottleneck may we worth fixing someday - indexer = labels.argsort(kind='mergesort') + if sort: + indexer = labels.argsort(kind='mergesort') - labels = labels.take(indexer) - index = index.take(indexer) + labels = labels.take(indexer) + index = index.take(indexer) if n == 0: return result @@ -1255,4 +1445,45 @@ def groupby_arrays(ndarray index, ndarray[int64_t] labels): start = i cur = lab + result[cur] = index[start:] + return result + +def indices_fast(object index, ndarray[int64_t] labels, list keys, + list sorted_labels): + cdef: + Py_ssize_t i, j, k, lab, cur, start, n = len(labels) + dict result = {} + object tup + + k = len(keys) + + if n == 0: + return result + + start = 0 + cur = labels[0] + for i in range(1, n): + lab = labels[i] + + if lab != cur: + if lab != -1: + tup = PyTuple_New(k) + for j in range(k): + val = util.get_value_at(keys[j], + sorted_labels[j][i-1]) + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) + + result[tup] = index[start:i] + start = i + cur = lab + + tup = PyTuple_New(k) + for j in range(k): + val = util.get_value_at(keys[j], + sorted_labels[j][n - 1]) + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) + result[tup] = index[start:] + return result diff --git a/pandas/src/hashtable.pyx b/pandas/src/hashtable.pyx index d6a5b3a442c7e..fea622449b47c 100644 --- a/pandas/src/hashtable.pyx +++ b/pandas/src/hashtable.pyx @@ -823,9 +823,10 @@ cdef class Int64Factorizer: def get_count(self): return self.count - def factorize(self, ndarray[int64_t] values, sort=False): + def factorize(self, ndarray[int64_t] values, sort=False, + na_sentinel=-1): labels, counts = self.table.get_labels(values, self.uniques, - self.count, -1) + self.count, na_sentinel) # sort on if sort: diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 3b23de6eabf8b..87fbb7076880e 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -11,7 +11,7 @@ _TYPE_MAP = { np.uint64: 'integer', np.float32: 'floating', np.float64: 'floating', - np.complex64: 
'complex', + np.complex128: 'complex', np.complex128: 'complex', np.string_: 'string', np.unicode_: 'unicode', @@ -223,31 +223,37 @@ def maybe_convert_numeric(ndarray[object] values, set na_values): cdef: Py_ssize_t i, n ndarray[float64_t] floats + ndarray[complex128_t] complexes ndarray[int64_t] ints bint seen_float = 0 + bint seen_complex = 0 object val float64_t fval n = len(values) floats = np.empty(n, dtype='f8') + complexes = np.empty(n, dtype='c16') ints = np.empty(n, dtype='i8') for i from 0 <= i < n: val = values[i] if util.is_float_object(val): - floats[i] = val + floats[i] = complexes[i] = val seen_float = 1 elif val in na_values: - floats[i] = nan + floats[i] = complexes[i] = nan seen_float = 1 elif val is None: - floats[i] = nan + floats[i] = complexes[i] = nan seen_float = 1 elif len(val) == 0: - floats[i] = nan + floats[i] = complexes[i] = nan seen_float = 1 + elif util.is_complex_object(val): + complexes[i] = val + seen_complex = 1 else: fval = util.floatify(val) floats[i] = fval @@ -257,7 +263,9 @@ def maybe_convert_numeric(ndarray[object] values, set na_values): else: ints[i] = fval - if seen_float: + if seen_complex: + return complexes + elif seen_float: return floats else: return ints @@ -270,9 +278,11 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, cdef: Py_ssize_t i, n ndarray[float64_t] floats + ndarray[complex128_t] complexes ndarray[int64_t] ints ndarray[uint8_t] bools bint seen_float = 0 + bint seen_complex = 0 bint seen_int = 0 bint seen_bool = 0 bint seen_object = 0 @@ -283,6 +293,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, n = len(objects) floats = np.empty(n, dtype='f8') + complexes = np.empty(n, dtype='c16') ints = np.empty(n, dtype='i8') bools = np.empty(n, dtype=np.uint8) @@ -294,7 +305,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if val is None: seen_null = 1 - floats[i] = fnan + floats[i] = complexes[i] = fnan elif util.is_bool_object(val): seen_bool = 1 bools[i] = val @@ -305,15 +316,20 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, elif util.is_integer_object(val): seen_int = 1 floats[i] = val + complexes[i] = val if not seen_null: ints[i] = val elif util.is_float_object(val): - floats[i] = val + floats[i] = complexes[i] = val seen_float = 1 + elif util.is_complex_object(val): + complexes[i] = val + seen_complex = 1 elif try_float and not util.is_string_object(val): # this will convert Decimal objects try: floats[i] = float(val) + complexes[i] = complex(val) seen_float = 1 except Exception: seen_object = 1 @@ -323,14 +339,19 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not safe: if seen_null: if (seen_float or seen_int) and not seen_object: - return floats + if seen_complex: + return complexes + else: + return floats else: return objects else: if seen_object: return objects elif not seen_bool: - if seen_float: + if seen_complex: + return complexes + elif seen_float: return floats elif seen_int: return ints @@ -343,7 +364,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, # don't cast int to float, etc. 
if seen_null: if (seen_float or seen_int) and not seen_object: - return floats + if seen_complex: + return complexes + else: + return floats else: return objects else: @@ -352,6 +376,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, elif not seen_bool: if seen_int and seen_float: return objects + elif seen_complex: + return complexes elif seen_float: return floats elif seen_int: @@ -465,15 +491,13 @@ def map_infer(ndarray arr, object f): ''' cdef: Py_ssize_t i, n - flatiter it ndarray[object] result object val - it = PyArray_IterNew(arr) n = len(arr) result = np.empty(n, dtype=object) for i in range(n): - val = f(PyArray_GETITEM(arr, PyArray_ITER_DATA(it))) + val = f(util.get_value_at(arr, i)) # unbox 0-dim arrays, GH #690 if is_array(val) and PyArray_NDIM(val) == 0: @@ -482,9 +506,6 @@ def map_infer(ndarray arr, object f): result[i] = val - - PyArray_ITER_NEXT(it) - return maybe_convert_objects(result, try_float=0) def to_object_array(list rows): diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index 502635012ad39..06d00fe2e16f7 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -118,8 +118,9 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, return left_indexer, right_indexer + def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, - Py_ssize_t max_groups): + Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 ndarray[int64_t] left_count, right_count, left_sorter, right_sorter @@ -143,8 +144,8 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, # group 0 is the NA group cdef: - Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 - Py_ssize_t offset + int64_t left_pos = 0, right_pos = 0 + Py_ssize_t offset, position = 0 # exclude the NA group left_pos = left_count[0] @@ -180,6 +181,8 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, return (_get_result_indexer(left_sorter, left_indexer), _get_result_indexer(right_sorter, right_indexer)) + + def _get_result_indexer(sorter, indexer): if indexer.dtype != np.int_: indexer = indexer.astype(np.int_) @@ -188,6 +191,51 @@ def _get_result_indexer(sorter, indexer): return res + +def ffill_indexer(ndarray[int64_t] indexer): + cdef: + Py_ssize_t i, n = len(indexer) + ndarray[int64_t] result + int64_t val, last_obs + + result = np.empty(n, dtype=np.int64) + last_obs = -1 + + for i in range(n): + val = indexer[i] + if val == -1: + result[i] = last_obs + else: + result[i] = val + last_obs = val + + return result + + +def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, + int64_t max_group): + cdef: + Py_ssize_t i, n = len(indexer) + ndarray[int64_t] result, last_obs + int64_t gid, val + + result = np.empty(n, dtype=np.int64) + + last_obs = np.empty(max_group, dtype=np.int64) + last_obs.fill(-1) + + for i in range(n): + gid = group_ids[i] + val = indexer[i] + if val == -1: + result[i] = last_obs[gid] + else: + result[i] = val + last_obs[gid] = val + + return result + + @cython.boundscheck(False) @cython.wraparound(False) def join_sorter(ndarray[int64_t] index, Py_ssize_t ngroups): diff --git a/pandas/src/moments.pyx b/pandas/src/moments.pyx index 6bf644cf9ac78..0e9c05de1abb4 100644 --- a/pandas/src/moments.pyx +++ b/pandas/src/moments.pyx @@ -72,6 +72,45 @@ def median(ndarray arr): return (kth_smallest(arr, n / 2) + kth_smallest(arr, n / 2 - 1)) / 2 +# -------------- Min, Max subsequence + +def max_subseq(ndarray[double_t] arr): + cdef: + Py_ssize_t i=0,s=0,e=0,T,n + double m, S + + n = len(arr) + + if len(arr) 
== 0: + return (-1,-1,None) + + m = arr[0] + S = m + T = 0 + + for i in range(1, n): + # S = max { S + A[i], A[i] ) + if (S > 0): + S = S + arr[i] + else: + S = arr[i] + T = i + if S > m: + s = T + e = i + m = S + + return (s, e, m) + +def min_subseq(ndarray[double_t] arr): + cdef: + Py_ssize_t s, e + double m + + (s, e, m) = max_subseq(-arr) + + return (s, e, -m) + #------------------------------------------------------------------------------- # Rolling sum diff --git a/pandas/src/np_datetime.c b/pandas/src/np_datetime.c index 521f964cf86db..06b7b8abd8661 100644 --- a/pandas/src/np_datetime.c +++ b/pandas/src/np_datetime.c @@ -63,7 +63,7 @@ int dayofweek(int y, int m, int d) * the current values are valid. */ void -add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes) +add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes) { int isleap; @@ -115,7 +115,7 @@ add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes) * Calculates the days offset from the 1970 epoch. */ npy_int64 -get_datetimestruct_days(const npy_datetimestruct *dts) +get_datetimestruct_days(const pandas_datetimestruct *dts) { int i, month; npy_int64 year, days = 0; @@ -221,7 +221,7 @@ days_to_yearsdays(npy_int64 *days_) * the current values are valid. */ NPY_NO_EXPORT void -add_seconds_to_datetimestruct(npy_datetimestruct *dts, int seconds) +add_seconds_to_datetimestruct(pandas_datetimestruct *dts, int seconds) { int minutes; @@ -247,7 +247,7 @@ add_seconds_to_datetimestruct(npy_datetimestruct *dts, int seconds) * offset from 1970. */ static void -set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) +set_datetimestruct_days(npy_int64 days, pandas_datetimestruct *dts) { int *month_lengths, i; @@ -269,7 +269,7 @@ set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) /* * * Tests for and converts a Python datetime.datetime or datetime.date - * object into a NumPy npy_datetimestruct. + * object into a NumPy pandas_datetimestruct. * * While the C API has PyDate_* and PyDateTime_* functions, the following * implementation just asks for attributes, and thus supports @@ -286,15 +286,15 @@ set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) * if obj doesn't have the neeeded date or datetime attributes. 
*/ int -convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out, - NPY_DATETIMEUNIT *out_bestunit, +convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, int apply_tzinfo) { PyObject *tmp; int isleap; /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); + memset(out, 0, sizeof(pandas_datetimestruct)); out->month = 1; out->day = 1; @@ -358,7 +358,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out, !PyObject_HasAttrString(obj, "microsecond")) { /* The best unit for date is 'D' */ if (out_bestunit != NULL) { - *out_bestunit = NPY_FR_D; + *out_bestunit = PANDAS_FR_D; } return 0; } @@ -463,7 +463,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out, /* The resolution of Python's datetime is 'us' */ if (out_bestunit != NULL) { - *out_bestunit = NPY_FR_us; + *out_bestunit = PANDAS_FR_us; } return 0; @@ -482,6 +482,29 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out, return -1; } +npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d) +{ + pandas_datetime_metadata meta; + npy_datetime result = PANDAS_DATETIME_NAT; + + meta.base = fr; + meta.num = 1; + + convert_datetimestruct_to_datetime(&meta, d, &result); + return result; +} + +void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *result) +{ + pandas_datetime_metadata meta; + + meta.base = fr; + meta.num = 1; + + convert_datetime_to_datetimestruct(&meta, val, result); +} + /* * Converts a datetime from a datetimestruct to a datetime based * on some metadata. The date is assumed to be valid. @@ -491,18 +514,18 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out, * Returns 0 on success, -1 on failure. 
*/ int -convert_datetimestruct_to_datetime(PyArray_DatetimeMetaData *meta, - const npy_datetimestruct *dts, +convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, + const pandas_datetimestruct *dts, npy_datetime *out) { npy_datetime ret; - NPY_DATETIMEUNIT base = meta->base; + PANDAS_DATETIMEUNIT base = meta->base; - if (base == NPY_FR_Y) { + if (base == PANDAS_FR_Y) { /* Truncate to the year */ ret = dts->year - 1970; } - else if (base == NPY_FR_M) { + else if (base == PANDAS_FR_M) { /* Truncate to the month */ ret = 12 * (dts->year - 1970) + (dts->month - 1); } @@ -511,7 +534,7 @@ convert_datetimestruct_to_datetime(PyArray_DatetimeMetaData *meta, npy_int64 days = get_datetimestruct_days(dts); switch (base) { - case NPY_FR_W: + case PANDAS_FR_W: /* Truncate to weeks */ if (days >= 0) { ret = days / 7; @@ -520,39 +543,39 @@ convert_datetimestruct_to_datetime(PyArray_DatetimeMetaData *meta, ret = (days - 6) / 7; } break; - case NPY_FR_D: + case PANDAS_FR_D: ret = days; break; - case NPY_FR_h: + case PANDAS_FR_h: ret = days * 24 + dts->hour; break; - case NPY_FR_m: + case PANDAS_FR_m: ret = (days * 24 + dts->hour) * 60 + dts->min; break; - case NPY_FR_s: + case PANDAS_FR_s: ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec; break; - case NPY_FR_ms: + case PANDAS_FR_ms: ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * 1000 + dts->us / 1000; break; - case NPY_FR_us: + case PANDAS_FR_us: ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * 1000000 + dts->us; break; - case NPY_FR_ns: + case PANDAS_FR_ns: ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + @@ -560,7 +583,7 @@ convert_datetimestruct_to_datetime(PyArray_DatetimeMetaData *meta, dts->us) * 1000 + dts->ps / 1000; break; - case NPY_FR_ps: + case PANDAS_FR_ps: ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + @@ -568,7 +591,7 @@ convert_datetimestruct_to_datetime(PyArray_DatetimeMetaData *meta, dts->us) * 1000000 + dts->ps; break; - case NPY_FR_fs: + case PANDAS_FR_fs: /* only 2.6 hours */ ret = (((((days * 24 + dts->hour) * 60 + @@ -578,7 +601,7 @@ convert_datetimestruct_to_datetime(PyArray_DatetimeMetaData *meta, dts->ps) * 1000 + dts->as / 1000; break; - case NPY_FR_as: + case PANDAS_FR_as: /* only 9.2 secs */ ret = (((((days * 24 + dts->hour) * 60 + @@ -619,8 +642,8 @@ convert_datetimestruct_to_datetime(PyArray_DatetimeMetaData *meta, * months units, and all the other units. */ npy_bool -can_cast_timedelta64_units(NPY_DATETIMEUNIT src_unit, - NPY_DATETIMEUNIT dst_unit, +can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, NPY_CASTING casting) { switch (casting) { @@ -633,8 +656,8 @@ can_cast_timedelta64_units(NPY_DATETIMEUNIT src_unit, * 'same_kind' casting. 
*/ case NPY_SAME_KIND_CASTING: - return (src_unit <= NPY_FR_M && dst_unit <= NPY_FR_M) || - (src_unit > NPY_FR_M && dst_unit > NPY_FR_M); + return (src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) || + (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M); /* * Enforce the 'date units' vs 'time units' barrier and that @@ -643,8 +666,8 @@ can_cast_timedelta64_units(NPY_DATETIMEUNIT src_unit, */ case NPY_SAFE_CASTING: return (src_unit <= dst_unit) && - ((src_unit <= NPY_FR_M && dst_unit <= NPY_FR_M) || - (src_unit > NPY_FR_M && dst_unit > NPY_FR_M)); + ((src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) || + (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M)); /* Enforce equality with 'no' or 'equiv' casting */ default: @@ -659,8 +682,8 @@ can_cast_timedelta64_units(NPY_DATETIMEUNIT src_unit, * for all but 'unsafe' casting. */ npy_bool -can_cast_datetime64_units(NPY_DATETIMEUNIT src_unit, - NPY_DATETIMEUNIT dst_unit, +can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, NPY_CASTING casting) { switch (casting) { @@ -673,8 +696,8 @@ can_cast_datetime64_units(NPY_DATETIMEUNIT src_unit, * 'same_kind' casting. */ case NPY_SAME_KIND_CASTING: - return (src_unit <= NPY_FR_D && dst_unit <= NPY_FR_D) || - (src_unit > NPY_FR_D && dst_unit > NPY_FR_D); + return (src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || + (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D); /* * Enforce the 'date units' vs 'time units' barrier and that @@ -683,8 +706,8 @@ can_cast_datetime64_units(NPY_DATETIMEUNIT src_unit, */ case NPY_SAFE_CASTING: return (src_unit <= dst_unit) && - ((src_unit <= NPY_FR_D && dst_unit <= NPY_FR_D) || - (src_unit > NPY_FR_D && dst_unit > NPY_FR_D)); + ((src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || + (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D)); /* Enforce equality with 'no' or 'equiv' casting */ default: @@ -696,14 +719,14 @@ can_cast_datetime64_units(NPY_DATETIMEUNIT src_unit, * Converts a datetime based on the given metadata into a datetimestruct */ int -convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, +convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, npy_datetime dt, - npy_datetimestruct *out) + pandas_datetimestruct *out) { npy_int64 perday; /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); + memset(out, 0, sizeof(pandas_datetimestruct)); out->year = 1970; out->month = 1; out->day = 1; @@ -716,11 +739,11 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, * for negative values. 
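The date-unit versus time-unit barrier above follows directly from the enum order (everything up to PANDAS_FR_D is a date unit). Two hand-checked calls, for illustration only:

    /* seconds -> nanoseconds: both time units, precision only increases */
    can_cast_datetime64_units(PANDAS_FR_s, PANDAS_FR_ns, NPY_SAFE_CASTING);  /* -> 1 */

    /* days -> hours crosses the date/time barrier, so 'safe' casting refuses it */
    can_cast_datetime64_units(PANDAS_FR_D, PANDAS_FR_h, NPY_SAFE_CASTING);   /* -> 0 */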
*/ switch (meta->base) { - case NPY_FR_Y: + case PANDAS_FR_Y: out->year = 1970 + dt; break; - case NPY_FR_M: + case PANDAS_FR_M: if (dt >= 0) { out->year = 1970 + dt / 12; out->month = dt % 12 + 1; @@ -731,16 +754,16 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, } break; - case NPY_FR_W: + case PANDAS_FR_W: /* A week is 7 days */ set_datetimestruct_days(dt * 7, out); break; - case NPY_FR_D: + case PANDAS_FR_D: set_datetimestruct_days(dt, out); break; - case NPY_FR_h: + case PANDAS_FR_h: perday = 24LL; if (dt >= 0) { @@ -754,7 +777,7 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, out->hour = dt; break; - case NPY_FR_m: + case PANDAS_FR_m: perday = 24LL * 60; if (dt >= 0) { @@ -769,7 +792,7 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, out->min = dt % 60; break; - case NPY_FR_s: + case PANDAS_FR_s: perday = 24LL * 60 * 60; if (dt >= 0) { @@ -785,7 +808,7 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, out->sec = dt % 60; break; - case NPY_FR_ms: + case PANDAS_FR_ms: perday = 24LL * 60 * 60 * 1000; if (dt >= 0) { @@ -802,7 +825,7 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, out->us = (dt % 1000LL) * 1000; break; - case NPY_FR_us: + case PANDAS_FR_us: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; if (dt >= 0) { @@ -819,7 +842,7 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, out->us = dt % 1000000LL; break; - case NPY_FR_ns: + case PANDAS_FR_ns: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; if (dt >= 0) { @@ -837,7 +860,7 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, out->ps = (dt % 1000LL) * 1000; break; - case NPY_FR_ps: + case PANDAS_FR_ps: perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; if (dt >= 0) { @@ -855,7 +878,7 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, out->ps = dt % 1000000LL; break; - case NPY_FR_fs: + case PANDAS_FR_fs: /* entire range is only +- 2.6 hours */ if (dt >= 0) { out->hour = dt / (60*60*1000000000000000LL); @@ -883,7 +906,7 @@ convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, } break; - case NPY_FR_as: + case PANDAS_FR_as: /* entire range is only +- 9.2 seconds */ if (dt >= 0) { out->sec = (dt / 1000000000000000000LL) % 60; diff --git a/pandas/src/np_datetime.h b/pandas/src/np_datetime.h index 29598e9262b21..042ea11d015e9 100644 --- a/pandas/src/np_datetime.h +++ b/pandas/src/np_datetime.h @@ -6,15 +6,51 @@ #ifndef _PANDAS_DATETIME_H_ #define _PANDAS_DATETIME_H_ -#define NPY_DATETIME_MAX_ISO8601_STRLEN (21+3*5+1+3*6+6+1) +typedef enum { + PANDAS_FR_Y, /* Years */ + PANDAS_FR_M, /* Months */ + PANDAS_FR_W, /* Weeks */ + PANDAS_FR_D, /* Days */ + PANDAS_FR_B, /* Business days */ + PANDAS_FR_h, /* hours */ + PANDAS_FR_m, /* minutes */ + PANDAS_FR_s, /* seconds */ + PANDAS_FR_ms,/* milliseconds */ + PANDAS_FR_us,/* microseconds */ + PANDAS_FR_ns,/* nanoseconds */ + PANDAS_FR_ps,/* picoseconds */ + PANDAS_FR_fs,/* femtoseconds */ + PANDAS_FR_as,/* attoseconds */ +} PANDAS_DATETIMEUNIT; + +#define PANDAS_DATETIME_NUMUNITS 14 + +#define PANDAS_DATETIME_MAX_ISO8601_STRLEN (21+3*5+1+3*6+6+1) + +#define PANDAS_DATETIME_NAT NPY_MIN_INT64 + +typedef struct { + npy_int64 year; + npy_int32 month, day, hour, min, sec, us, ps, as; +} pandas_datetimestruct; + +typedef struct { + PANDAS_DATETIMEUNIT base; + int num; +} pandas_datetime_metadata; // stuff pandas needs // ---------------------------------------------------------------------------- -int 
convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out, - NPY_DATETIMEUNIT *out_bestunit, +int convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, int apply_tzinfo); +npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d); + +void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *result); + int dayofweek(int y, int m, int d); static int _days_per_month_table[2][12] = { @@ -22,7 +58,7 @@ static int _days_per_month_table[2][12] = { { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } }; -// stuff numpy needs in header +// stuff numpy-derived code needs in header // ---------------------------------------------------------------------------- int is_leapyear(npy_int64 year); @@ -36,22 +72,22 @@ int is_leapyear(npy_int64 year); * Returns 0 on success, -1 on failure. */ int -convert_datetimestruct_to_datetime(PyArray_DatetimeMetaData *meta, - const npy_datetimestruct *dts, +convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, + const pandas_datetimestruct *dts, npy_datetime *out); /* * Calculates the days offset from the 1970 epoch. */ npy_int64 -get_datetimestruct_days(const npy_datetimestruct *dts); +get_datetimestruct_days(const pandas_datetimestruct *dts); /* * Adjusts a datetimestruct based on a minutes offset. Assumes * the current values are valid. */ void -add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes); +add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes); /* * This provides the casting rules for the TIMEDELTA data type units. @@ -60,19 +96,21 @@ add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes); * months units, and all the other units. */ //npy_bool -//can_cast_timedelta64_units(NPY_DATETIMEUNIT src_unit, -// NPY_DATETIMEUNIT dst_unit, +//can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, +// PANDAS_DATETIMEUNIT dst_unit, // NPY_CASTING casting); npy_bool -can_cast_datetime64_units(NPY_DATETIMEUNIT src_unit, - NPY_DATETIMEUNIT dst_unit, +can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, NPY_CASTING casting); int -convert_datetime_to_datetimestruct(PyArray_DatetimeMetaData *meta, - npy_datetime dt, - npy_datetimestruct *out); +convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, + npy_datetime dt, + pandas_datetimestruct *out); + + #endif diff --git a/pandas/src/np_datetime_strings.c b/pandas/src/np_datetime_strings.c index 1224ce16c953c..15ea534935c9e 100644 --- a/pandas/src/np_datetime_strings.c +++ b/pandas/src/np_datetime_strings.c @@ -57,21 +57,21 @@ typedef time_t NPY_TIME_T; /*}*/ /* Exported as DATETIMEUNITS in multiarraymodule.c */ -static char *_datetime_strings[NPY_DATETIME_NUMUNITS] = { - NPY_STR_Y, - NPY_STR_M, - NPY_STR_W, - NPY_STR_D, - NPY_STR_h, - NPY_STR_m, - NPY_STR_s, - NPY_STR_ms, - NPY_STR_us, - NPY_STR_ns, - NPY_STR_ps, - NPY_STR_fs, - NPY_STR_as, - "generic" +static char *_datetime_strings[PANDAS_DATETIME_NUMUNITS] = { + "Y", + "M", + "W", + "D", + "B", + "h", + "m", + "s", + "ms", + "us", + "ns", + "ps", + "fs", + "as", }; /* * Wraps `localtime` functionality for multiple platforms. This @@ -170,8 +170,8 @@ get_gmtime(NPY_TIME_T *ts, struct tm *tms) * Returns 0 on success, -1 on failure. 
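Because the rewritten table above inserts "B" after "D" and drops NumPy's trailing "generic" entry, it is indexed by PANDAS_DATETIMEUNIT rather than NPY_DATETIMEUNIT. A hypothetical accessor (not in the patch; _datetime_strings is static, so it would have to live in np_datetime_strings.c):

    static const char *unit_abbrev(PANDAS_DATETIMEUNIT unit)
    {
        return _datetime_strings[unit];   /* e.g. PANDAS_FR_ns -> "ns" */
    }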
*/ static int -convert_datetimestruct_utc_to_local(npy_datetimestruct *out_dts_local, - const npy_datetimestruct *dts_utc, int *out_timezone_offset) +convert_datetimestruct_utc_to_local(pandas_datetimestruct *out_dts_local, + const pandas_datetimestruct *dts_utc, int *out_timezone_offset) { NPY_TIME_T rawtime = 0, localrawtime; struct tm tm_; @@ -197,7 +197,7 @@ convert_datetimestruct_utc_to_local(npy_datetimestruct *out_dts_local, /* * Convert everything in 'dts' to a time_t, to minutes precision. * This is POSIX time, which skips leap-seconds, but because - * we drop the seconds value from the npy_datetimestruct, everything + * we drop the seconds value from the pandas_datetimestruct, everything * is ok for this operation. */ rawtime = (time_t)get_datetimestruct_days(out_dts_local) * 24 * 60 * 60; @@ -236,8 +236,8 @@ convert_datetimestruct_utc_to_local(npy_datetimestruct *out_dts_local, * Returns 0 on success, -1 on failure. */ static int -convert_datetimestruct_local_to_utc(npy_datetimestruct *out_dts_utc, - const npy_datetimestruct *dts_local) +convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, + const pandas_datetimestruct *dts_local) { npy_int64 year_correction = 0; @@ -306,11 +306,11 @@ convert_datetimestruct_local_to_utc(npy_datetimestruct *out_dts_utc, } /* int */ -/* parse_python_string(PyObject* obj, npy_datetimestruct *dts) { */ +/* parse_python_string(PyObject* obj, pandas_datetimestruct *dts) { */ /* PyObject *bytes = NULL; */ /* char *str = NULL; */ /* Py_ssize_t len = 0; */ -/* NPY_DATETIMEUNIT bestunit = -1; */ +/* PANDAS_DATETIMEUNIT bestunit = -1; */ /* /\* Convert to an ASCII string for the date parser *\/ */ /* if (PyUnicode_Check(obj)) { */ @@ -329,7 +329,7 @@ convert_datetimestruct_local_to_utc(npy_datetimestruct *out_dts_utc, /* } */ /* /\* Parse the ISO date *\/ */ -/* if (parse_iso_8601_datetime(str, len, NPY_FR_us, NPY_UNSAFE_CASTING, */ +/* if (parse_iso_8601_datetime(str, len, PANDAS_FR_us, NPY_UNSAFE_CASTING, */ /* dts, NULL, &bestunit, NULL) < 0) { */ /* Py_DECREF(bytes); */ /* return -1; */ @@ -377,20 +377,20 @@ convert_datetimestruct_local_to_utc(npy_datetimestruct *out_dts_utc, */ int parse_iso_8601_datetime(char *str, int len, - NPY_DATETIMEUNIT unit, + PANDAS_DATETIMEUNIT unit, NPY_CASTING casting, - npy_datetimestruct *out, + pandas_datetimestruct *out, npy_bool *out_local, - NPY_DATETIMEUNIT *out_bestunit, + PANDAS_DATETIMEUNIT *out_bestunit, npy_bool *out_special) { int year_leap = 0; int i, numdigits; char *substr, sublen; - NPY_DATETIMEUNIT bestunit; + PANDAS_DATETIMEUNIT bestunit; /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); + memset(out, 0, sizeof(pandas_datetimestruct)); out->month = 1; out->day = 1; @@ -420,7 +420,7 @@ parse_iso_8601_datetime(char *str, int len, out->month = tm_.tm_mon + 1; out->day = tm_.tm_mday; - bestunit = NPY_FR_D; + bestunit = PANDAS_FR_D; /* * Indicate that this was a special value, and @@ -454,15 +454,15 @@ parse_iso_8601_datetime(char *str, int len, tolower(str[1]) == 'o' && tolower(str[2]) == 'w') { NPY_TIME_T rawtime = 0; - PyArray_DatetimeMetaData meta; + pandas_datetime_metadata meta; time(&rawtime); /* Set up a dummy metadata for the conversion */ - meta.base = NPY_FR_s; + meta.base = PANDAS_FR_s; meta.num = 1; - bestunit = NPY_FR_s; + bestunit = PANDAS_FR_s; /* * Indicate that this was a special value, and @@ -536,7 +536,7 @@ parse_iso_8601_datetime(char *str, int len, if (out_local != NULL) { *out_local = 0; } - bestunit = NPY_FR_Y; + bestunit = 
PANDAS_FR_Y; goto finish; } else if (*substr == '-') { @@ -573,7 +573,7 @@ parse_iso_8601_datetime(char *str, int len, if (out_local != NULL) { *out_local = 0; } - bestunit = NPY_FR_M; + bestunit = PANDAS_FR_M; goto finish; } else if (*substr == '-') { @@ -611,7 +611,7 @@ parse_iso_8601_datetime(char *str, int len, if (out_local != NULL) { *out_local = 0; } - bestunit = NPY_FR_D; + bestunit = PANDAS_FR_D; goto finish; } else if (*substr != 'T' && *substr != ' ') { @@ -644,7 +644,7 @@ parse_iso_8601_datetime(char *str, int len, --sublen; } else { - bestunit = NPY_FR_h; + bestunit = PANDAS_FR_h; goto parse_timezone; } @@ -675,7 +675,7 @@ parse_iso_8601_datetime(char *str, int len, --sublen; } else { - bestunit = NPY_FR_m; + bestunit = PANDAS_FR_m; goto parse_timezone; } @@ -706,7 +706,7 @@ parse_iso_8601_datetime(char *str, int len, --sublen; } else { - bestunit = NPY_FR_s; + bestunit = PANDAS_FR_s; goto parse_timezone; } @@ -724,10 +724,10 @@ parse_iso_8601_datetime(char *str, int len, if (sublen == 0 || !isdigit(*substr)) { if (numdigits > 3) { - bestunit = NPY_FR_us; + bestunit = PANDAS_FR_us; } else { - bestunit = NPY_FR_ms; + bestunit = PANDAS_FR_ms; } goto parse_timezone; } @@ -746,10 +746,10 @@ parse_iso_8601_datetime(char *str, int len, if (sublen == 0 || !isdigit(*substr)) { if (numdigits > 3) { - bestunit = NPY_FR_ps; + bestunit = PANDAS_FR_ps; } else { - bestunit = NPY_FR_ns; + bestunit = PANDAS_FR_ns; } goto parse_timezone; } @@ -767,10 +767,10 @@ parse_iso_8601_datetime(char *str, int len, } if (numdigits > 3) { - bestunit = NPY_FR_as; + bestunit = PANDAS_FR_as; } else { - bestunit = NPY_FR_fs; + bestunit = PANDAS_FR_fs; } parse_timezone: @@ -911,54 +911,44 @@ parse_iso_8601_datetime(char *str, int len, * objects with the given local and unit settings. */ int -get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) +get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) { int len = 0; /* If no unit is provided, return the maximum length */ if (base == -1) { - return NPY_DATETIME_MAX_ISO8601_STRLEN; + return PANDAS_DATETIME_MAX_ISO8601_STRLEN; } switch (base) { /* Generic units can only be used to represent NaT */ - /*case NPY_FR_GENERIC:*/ + /*case PANDAS_FR_GENERIC:*/ /* return 4;*/ - case NPY_FR_as: + case PANDAS_FR_as: len += 3; /* "###" */ - break; - case NPY_FR_fs: + case PANDAS_FR_fs: len += 3; /* "###" */ - break; - case NPY_FR_ps: + case PANDAS_FR_ps: len += 3; /* "###" */ - break; - case NPY_FR_ns: + case PANDAS_FR_ns: len += 3; /* "###" */ - break; - case NPY_FR_us: + case PANDAS_FR_us: len += 3; /* "###" */ - break; - case NPY_FR_ms: + case PANDAS_FR_ms: len += 4; /* ".###" */ - break; - case NPY_FR_s: + case PANDAS_FR_s: len += 3; /* ":##" */ - break; - case NPY_FR_m: + case PANDAS_FR_m: len += 3; /* ":##" */ - break; - case NPY_FR_h: + case PANDAS_FR_h: len += 3; /* "T##" */ - break; - case NPY_FR_D: - case NPY_FR_W: + case PANDAS_FR_D: + case PANDAS_FR_B: + case PANDAS_FR_W: len += 3; /* "-##" */ - break; - case NPY_FR_M: + case PANDAS_FR_M: len += 3; /* "-##" */ - break; - case NPY_FR_Y: + case PANDAS_FR_Y: len += 21; /* 64-bit year */ break; default: @@ -966,7 +956,7 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) break; } - if (base >= NPY_FR_h) { + if (base >= PANDAS_FR_h) { if (local) { len += 5; /* "+####" or "-####" */ } @@ -984,49 +974,49 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) * Finds the largest unit whose value is nonzero, and for which * the remainder for the rest of the units is zero. 
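The best-unit bookkeeping above lets the parser report how much precision the input string actually carried. Two hand-checked calls, shown as a sketch only (they assume the headers from this patch and unsafe casting, so no precision error is raised):

    pandas_datetimestruct dts;
    PANDAS_DATETIMEUNIT best;
    npy_bool is_local, special;

    parse_iso_8601_datetime("2012-05", 7, PANDAS_FR_us, NPY_UNSAFE_CASTING,
                            &dts, &is_local, &best, &special);
    /* best == PANDAS_FR_M: only year and month were given */

    parse_iso_8601_datetime("2012-05-01T12:30", 16, PANDAS_FR_us, NPY_UNSAFE_CASTING,
                            &dts, &is_local, &best, &special);
    /* best == PANDAS_FR_m: the string stops at minute precision */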
*/ -static NPY_DATETIMEUNIT -lossless_unit_from_datetimestruct(npy_datetimestruct *dts) +static PANDAS_DATETIMEUNIT +lossless_unit_from_datetimestruct(pandas_datetimestruct *dts) { if (dts->as % 1000 != 0) { - return NPY_FR_as; + return PANDAS_FR_as; } else if (dts->as != 0) { - return NPY_FR_fs; + return PANDAS_FR_fs; } else if (dts->ps % 1000 != 0) { - return NPY_FR_ps; + return PANDAS_FR_ps; } else if (dts->ps != 0) { - return NPY_FR_ns; + return PANDAS_FR_ns; } else if (dts->us % 1000 != 0) { - return NPY_FR_us; + return PANDAS_FR_us; } else if (dts->us != 0) { - return NPY_FR_ms; + return PANDAS_FR_ms; } else if (dts->sec != 0) { - return NPY_FR_s; + return PANDAS_FR_s; } else if (dts->min != 0) { - return NPY_FR_m; + return PANDAS_FR_m; } else if (dts->hour != 0) { - return NPY_FR_h; + return PANDAS_FR_h; } else if (dts->day != 1) { - return NPY_FR_D; + return PANDAS_FR_D; } else if (dts->month != 1) { - return NPY_FR_M; + return PANDAS_FR_M; } else { - return NPY_FR_Y; + return PANDAS_FR_Y; } } /* - * Converts an npy_datetimestruct to an (almost) ISO 8601 + * Converts an pandas_datetimestruct to an (almost) ISO 8601 * NULL-terminated string. If the string fits in the space exactly, * it leaves out the NULL terminator and returns success. * @@ -1052,11 +1042,11 @@ lossless_unit_from_datetimestruct(npy_datetimestruct *dts) * string was too short). */ int -make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, - int local, NPY_DATETIMEUNIT base, int tzoffset, +make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, NPY_CASTING casting) { - npy_datetimestruct dts_local; + pandas_datetimestruct dts_local; int timezone_offset = 0; char *substr = outstr, sublen = outlen; @@ -1074,12 +1064,12 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, * If there's a timezone, use at least minutes precision, * and never split up hours and minutes by default */ - if ((base < NPY_FR_m && local) || base == NPY_FR_h) { - base = NPY_FR_m; + if ((base < PANDAS_FR_m && local) || base == PANDAS_FR_h) { + base = PANDAS_FR_m; } /* Don't split up dates by default */ - else if (base < NPY_FR_D) { - base = NPY_FR_D; + else if (base < PANDAS_FR_D) { + base = PANDAS_FR_D; } } /* @@ -1088,8 +1078,8 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, * TODO: Could print weeks with YYYY-Www format if the week * epoch is a Monday. 
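lossless_unit_from_datetimestruct scans from the finest field upward and stops at the first one that cannot be dropped without losing information. Two hand-checked cases (illustrative only; the function is static to this file):

    pandas_datetimestruct d1 = {2012, 5, 3, 0, 0, 0, 0, 0, 0};    /* plain date */
    pandas_datetimestruct d2 = {2012, 1, 1, 0, 0, 0, 500, 0, 0};  /* 500 microseconds */

    lossless_unit_from_datetimestruct(&d1);   /* -> PANDAS_FR_D  (day != 1) */
    lossless_unit_from_datetimestruct(&d2);   /* -> PANDAS_FR_us (us % 1000 != 0) */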
*/ - else if (base == NPY_FR_W) { - base = NPY_FR_D; + else if (base == PANDAS_FR_W) { + base = PANDAS_FR_D; } /* Use the C API to convert from UTC to local time */ @@ -1104,7 +1094,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* Use the manually provided tzoffset */ else if (local) { - /* Make a copy of the npy_datetimestruct we can modify */ + /* Make a copy of the pandas_datetimestruct we can modify */ dts_local = *dts; dts = &dts_local; @@ -1120,7 +1110,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, */ if (casting != NPY_UNSAFE_CASTING) { /* Producing a date as a local time is always 'unsafe' */ - if (base <= NPY_FR_D && local) { + if (base <= PANDAS_FR_D && local) { PyErr_SetString(PyExc_TypeError, "Cannot create a local " "timezone-based date string from a NumPy " "datetime without forcing 'unsafe' casting"); @@ -1128,7 +1118,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } /* Only 'unsafe' and 'same_kind' allow data loss */ else { - NPY_DATETIMEUNIT unitprec; + PANDAS_DATETIMEUNIT unitprec; unitprec = lossless_unit_from_datetimestruct(dts); if (casting != NPY_SAME_KIND_CASTING && unitprec > base) { @@ -1163,7 +1153,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= tmplen; /* Stop if the unit is years */ - if (base == NPY_FR_Y) { + if (base == PANDAS_FR_Y) { if (sublen > 0) { *substr = '\0'; } @@ -1187,7 +1177,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 3; /* Stop if the unit is months */ - if (base == NPY_FR_M) { + if (base == PANDAS_FR_M) { if (sublen > 0) { *substr = '\0'; } @@ -1211,7 +1201,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 3; /* Stop if the unit is days */ - if (base == NPY_FR_D) { + if (base == PANDAS_FR_D) { if (sublen > 0) { *substr = '\0'; } @@ -1235,7 +1225,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 3; /* Stop if the unit is hours */ - if (base == NPY_FR_h) { + if (base == PANDAS_FR_h) { goto add_time_zone; } @@ -1256,7 +1246,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 3; /* Stop if the unit is minutes */ - if (base == NPY_FR_m) { + if (base == PANDAS_FR_m) { goto add_time_zone; } @@ -1277,7 +1267,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 3; /* Stop if the unit is seconds */ - if (base == NPY_FR_s) { + if (base == PANDAS_FR_s) { goto add_time_zone; } @@ -1302,7 +1292,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 4; /* Stop if the unit is milliseconds */ - if (base == NPY_FR_ms) { + if (base == PANDAS_FR_ms) { goto add_time_zone; } @@ -1323,7 +1313,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 3; /* Stop if the unit is microseconds */ - if (base == NPY_FR_us) { + if (base == PANDAS_FR_us) { goto add_time_zone; } @@ -1344,7 +1334,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 3; /* Stop if the unit is nanoseconds */ - if (base == NPY_FR_ns) { + if (base == PANDAS_FR_ns) { goto add_time_zone; } @@ -1365,7 +1355,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, sublen -= 3; /* Stop if the unit is picoseconds */ - if (base == NPY_FR_ps) { + if (base == PANDAS_FR_ps) { goto add_time_zone; } @@ -1386,7 +1376,7 @@ make_iso_8601_datetime(npy_datetimestruct 
*dts, char *outstr, int outlen, sublen -= 3; /* Stop if the unit is femtoseconds */ - if (base == NPY_FR_fs) { + if (base == PANDAS_FR_fs) { goto add_time_zone; } diff --git a/pandas/src/np_datetime_strings.h b/pandas/src/np_datetime_strings.h index 0226d0aaccad6..9a2488fefaf56 100644 --- a/pandas/src/np_datetime_strings.h +++ b/pandas/src/np_datetime_strings.h @@ -42,11 +42,11 @@ */ int parse_iso_8601_datetime(char *str, int len, - NPY_DATETIMEUNIT unit, + PANDAS_DATETIMEUNIT unit, NPY_CASTING casting, - npy_datetimestruct *out, + pandas_datetimestruct *out, npy_bool *out_local, - NPY_DATETIMEUNIT *out_bestunit, + PANDAS_DATETIMEUNIT *out_bestunit, npy_bool *out_special); /* @@ -54,10 +54,10 @@ parse_iso_8601_datetime(char *str, int len, * objects with the given local and unit settings. */ int -get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); +get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base); /* - * Converts an npy_datetimestruct to an (almost) ISO 8601 + * Converts an pandas_datetimestruct to an (almost) ISO 8601 * NULL-terminated string. * * If 'local' is non-zero, it produces a string in local time with @@ -79,8 +79,8 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); * string was too short). */ int -make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, - int local, NPY_DATETIMEUNIT base, int tzoffset, +make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, NPY_CASTING casting); #endif diff --git a/pandas/src/numpy.pxd b/pandas/src/numpy.pxd new file mode 100644 index 0000000000000..45c2fc184a911 --- /dev/null +++ b/pandas/src/numpy.pxd @@ -0,0 +1,980 @@ +# NumPy static imports for Cython +# +# If any of the PyArray_* functions are called, import_array must be +# called first. +# +# This also defines backwards-compatability buffer acquisition +# code for use in Python 2.x (or Python <= 2.5 when NumPy starts +# implementing PEP-3118 directly). +# +# Because of laziness, the format string of the buffer is statically +# allocated. Increase the size if this is not enough, or submit a +# patch to do this properly. 
+# +# Author: Dag Sverre Seljebotn +# + +DEF _buffer_format_string_len = 255 + +cimport cpython.buffer as pybuf +from cpython.ref cimport Py_INCREF, Py_XDECREF +from cpython.object cimport PyObject +cimport libc.stdlib as stdlib +cimport libc.stdio as stdio + +cdef extern from "Python.h": + ctypedef int Py_intptr_t + +cdef extern from "numpy/arrayobject.h": + ctypedef Py_intptr_t npy_intp + ctypedef size_t npy_uintp + + cdef enum NPY_TYPES: + NPY_BOOL + NPY_BYTE + NPY_UBYTE + NPY_SHORT + NPY_USHORT + NPY_INT + NPY_UINT + NPY_LONG + NPY_ULONG + NPY_LONGLONG + NPY_ULONGLONG + NPY_FLOAT + NPY_DOUBLE + NPY_LONGDOUBLE + NPY_CFLOAT + NPY_CDOUBLE + NPY_CLONGDOUBLE + NPY_OBJECT + NPY_STRING + NPY_UNICODE + NPY_VOID + NPY_NTYPES + NPY_NOTYPE + + NPY_INT8 + NPY_INT16 + NPY_INT32 + NPY_INT64 + NPY_INT128 + NPY_INT256 + NPY_UINT8 + NPY_UINT16 + NPY_UINT32 + NPY_UINT64 + NPY_UINT128 + NPY_UINT256 + NPY_FLOAT16 + NPY_FLOAT32 + NPY_FLOAT64 + NPY_FLOAT80 + NPY_FLOAT96 + NPY_FLOAT128 + NPY_FLOAT256 + NPY_COMPLEX32 + NPY_COMPLEX64 + NPY_COMPLEX128 + NPY_COMPLEX160 + NPY_COMPLEX192 + NPY_COMPLEX256 + NPY_COMPLEX512 + + NPY_DATETIME + + NPY_INTP + + ctypedef enum NPY_ORDER: + NPY_ANYORDER + NPY_CORDER + NPY_FORTRANORDER + + ctypedef enum NPY_CLIPMODE: + NPY_CLIP + NPY_WRAP + NPY_RAISE + + ctypedef enum NPY_SCALARKIND: + NPY_NOSCALAR, + NPY_BOOL_SCALAR, + NPY_INTPOS_SCALAR, + NPY_INTNEG_SCALAR, + NPY_FLOAT_SCALAR, + NPY_COMPLEX_SCALAR, + NPY_OBJECT_SCALAR + + ctypedef enum NPY_SORTKIND: + NPY_QUICKSORT + NPY_HEAPSORT + NPY_MERGESORT + + ctypedef enum NPY_SEARCHSIDE: + NPY_SEARCHLEFT + NPY_SEARCHRIGHT + + enum: + NPY_C_CONTIGUOUS + NPY_F_CONTIGUOUS + NPY_CONTIGUOUS + NPY_FORTRAN + NPY_OWNDATA + NPY_FORCECAST + NPY_ENSURECOPY + NPY_ENSUREARRAY + NPY_ELEMENTSTRIDES + NPY_ALIGNED + NPY_NOTSWAPPED + NPY_WRITEABLE + NPY_UPDATEIFCOPY + NPY_ARR_HAS_DESCR + + NPY_BEHAVED + NPY_BEHAVED_NS + NPY_CARRAY + NPY_CARRAY_RO + NPY_FARRAY + NPY_FARRAY_RO + NPY_DEFAULT + + NPY_IN_ARRAY + NPY_OUT_ARRAY + NPY_INOUT_ARRAY + NPY_IN_FARRAY + NPY_OUT_FARRAY + NPY_INOUT_FARRAY + + NPY_UPDATE_ALL + + cdef enum: + NPY_MAXDIMS + + npy_intp NPY_MAX_ELSIZE + + ctypedef void (*PyArray_VectorUnaryFunc)(void *, void *, npy_intp, void *, void *) + + ctypedef class numpy.dtype [object PyArray_Descr]: + # Use PyDataType_* macros when possible, however there are no macros + # for accessing some of the fields, so some are defined. Please + # ask on cython-dev if you need more. + cdef int type_num + cdef int itemsize "elsize" + cdef char byteorder + cdef object fields + cdef tuple names + + ctypedef extern class numpy.flatiter [object PyArrayIterObject]: + # Use through macros + pass + + ctypedef extern class numpy.broadcast [object PyArrayMultiIterObject]: + # Use through macros + pass + + ctypedef struct PyArrayObject: + # For use in situations where ndarray can't replace PyArrayObject*, + # like PyArrayObject**. + pass + + ctypedef class numpy.ndarray [object PyArrayObject]: + cdef __cythonbufferdefaults__ = {"mode": "strided"} + + cdef: + # Only taking a few of the most commonly used and stable fields. + # One should use PyArray_* macros instead to access the C fields. + char *data + int ndim "nd" + npy_intp *shape "dimensions" + npy_intp *strides + dtype descr + PyObject* base + + # Note: This syntax (function definition in pxd files) is an + # experimental exception made for __getbuffer__ and __releasebuffer__ + # -- the details of this may change. 
+ def __getbuffer__(ndarray self, Py_buffer* info, int flags): + # This implementation of getbuffer is geared towards Cython + # requirements, and does not yet fullfill the PEP. + # In particular strided access is always provided regardless + # of flags + + if info == NULL: return + + cdef int copy_shape, i, ndim + cdef int endian_detector = 1 + cdef bint little_endian = ((&endian_detector)[0] != 0) + + ndim = PyArray_NDIM(self) + + if sizeof(npy_intp) != sizeof(Py_ssize_t): + copy_shape = 1 + else: + copy_shape = 0 + + if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + raise ValueError(u"ndarray is not C contiguous") + + if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + raise ValueError(u"ndarray is not Fortran contiguous") + + info.buf = PyArray_DATA(self) + info.ndim = ndim + if copy_shape: + # Allocate new buffer for strides and shape info. + # This is allocated as one block, strides first. + info.strides = stdlib.malloc(sizeof(Py_ssize_t) * ndim * 2) + info.shape = info.strides + ndim + for i in range(ndim): + info.strides[i] = PyArray_STRIDES(self)[i] + info.shape[i] = PyArray_DIMS(self)[i] + else: + info.strides = PyArray_STRIDES(self) + info.shape = PyArray_DIMS(self) + info.suboffsets = NULL + info.itemsize = PyArray_ITEMSIZE(self) + info.readonly = not PyArray_ISWRITEABLE(self) + + cdef int t + cdef char* f = NULL + cdef dtype descr = self.descr + cdef list stack + cdef int offset + + cdef bint hasfields = PyDataType_HASFIELDS(descr) + + if not hasfields and not copy_shape: + # do not call releasebuffer + info.obj = None + else: + # need to call releasebuffer + info.obj = self + + if not hasfields: + t = descr.type_num + if ((descr.byteorder == '>' and little_endian) or + (descr.byteorder == '<' and not little_endian)): + raise ValueError(u"Non-native byte order not supported") + if t == NPY_BYTE: f = "b" + elif t == NPY_UBYTE: f = "B" + elif t == NPY_SHORT: f = "h" + elif t == NPY_USHORT: f = "H" + elif t == NPY_INT: f = "i" + elif t == NPY_UINT: f = "I" + elif t == NPY_LONG: f = "l" + elif t == NPY_ULONG: f = "L" + elif t == NPY_LONGLONG: f = "q" + elif t == NPY_ULONGLONG: f = "Q" + elif t == NPY_FLOAT: f = "f" + elif t == NPY_DOUBLE: f = "d" + elif t == NPY_LONGDOUBLE: f = "g" + elif t == NPY_CFLOAT: f = "Zf" + elif t == NPY_CDOUBLE: f = "Zd" + elif t == NPY_CLONGDOUBLE: f = "Zg" + elif t == NPY_OBJECT: f = "O" + else: + raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + info.format = f + return + else: + info.format = stdlib.malloc(_buffer_format_string_len) + info.format[0] = '^' # Native data types, manual alignment + offset = 0 + f = _util_dtypestring(descr, info.format + 1, + info.format + _buffer_format_string_len, + &offset) + f[0] = 0 # Terminate format string + + def __releasebuffer__(ndarray self, Py_buffer* info): + if PyArray_HASFIELDS(self): + stdlib.free(info.format) + if sizeof(npy_intp) != sizeof(Py_ssize_t): + stdlib.free(info.strides) + # info.shape was stored after info.strides in the same block + + + ctypedef signed char npy_bool + + ctypedef signed char npy_byte + ctypedef signed short npy_short + ctypedef signed int npy_int + ctypedef signed long npy_long + ctypedef signed long long npy_longlong + + ctypedef unsigned char npy_ubyte + ctypedef unsigned short npy_ushort + ctypedef unsigned int npy_uint + ctypedef unsigned long npy_ulong + ctypedef unsigned long long npy_ulonglong + + ctypedef float 
npy_float + ctypedef double npy_double + ctypedef long double npy_longdouble + + ctypedef signed char npy_int8 + ctypedef signed short npy_int16 + ctypedef signed int npy_int32 + ctypedef signed long long npy_int64 + ctypedef signed long long npy_int96 + ctypedef signed long long npy_int128 + + ctypedef unsigned char npy_uint8 + ctypedef unsigned short npy_uint16 + ctypedef unsigned int npy_uint32 + ctypedef unsigned long long npy_uint64 + ctypedef unsigned long long npy_uint96 + ctypedef unsigned long long npy_uint128 + + ctypedef float npy_float32 + ctypedef double npy_float64 + ctypedef long double npy_float80 + ctypedef long double npy_float96 + ctypedef long double npy_float128 + + ctypedef struct npy_cfloat: + double real + double imag + + ctypedef struct npy_cdouble: + double real + double imag + + ctypedef struct npy_clongdouble: + double real + double imag + + ctypedef struct npy_complex64: + double real + double imag + + ctypedef struct npy_complex128: + double real + double imag + + ctypedef struct npy_complex160: + double real + double imag + + ctypedef struct npy_complex192: + double real + double imag + + ctypedef struct npy_complex256: + double real + double imag + + ctypedef struct PyArray_Dims: + npy_intp *ptr + int len + + void import_array() + + # + # Macros from ndarrayobject.h + # + bint PyArray_CHKFLAGS(ndarray m, int flags) + bint PyArray_ISCONTIGUOUS(ndarray m) + bint PyArray_ISWRITEABLE(ndarray m) + bint PyArray_ISALIGNED(ndarray m) + + int PyArray_NDIM(ndarray) + bint PyArray_ISONESEGMENT(ndarray) + bint PyArray_ISFORTRAN(ndarray) + int PyArray_FORTRANIF(ndarray) + + void* PyArray_DATA(ndarray) + char* PyArray_BYTES(ndarray) + npy_intp* PyArray_DIMS(ndarray) + npy_intp* PyArray_STRIDES(ndarray) + npy_intp PyArray_DIM(ndarray, size_t) + npy_intp PyArray_STRIDE(ndarray, size_t) + + # object PyArray_BASE(ndarray) wrong refcount semantics + # dtype PyArray_DESCR(ndarray) wrong refcount semantics + int PyArray_FLAGS(ndarray) + npy_intp PyArray_ITEMSIZE(ndarray) + int PyArray_TYPE(ndarray arr) + + object PyArray_GETITEM(ndarray arr, void *itemptr) + int PyArray_SETITEM(ndarray arr, void *itemptr, object obj) + + bint PyTypeNum_ISBOOL(int) + bint PyTypeNum_ISUNSIGNED(int) + bint PyTypeNum_ISSIGNED(int) + bint PyTypeNum_ISINTEGER(int) + bint PyTypeNum_ISFLOAT(int) + bint PyTypeNum_ISNUMBER(int) + bint PyTypeNum_ISSTRING(int) + bint PyTypeNum_ISCOMPLEX(int) + bint PyTypeNum_ISPYTHON(int) + bint PyTypeNum_ISFLEXIBLE(int) + bint PyTypeNum_ISUSERDEF(int) + bint PyTypeNum_ISEXTENDED(int) + bint PyTypeNum_ISOBJECT(int) + + bint PyDataType_ISBOOL(dtype) + bint PyDataType_ISUNSIGNED(dtype) + bint PyDataType_ISSIGNED(dtype) + bint PyDataType_ISINTEGER(dtype) + bint PyDataType_ISFLOAT(dtype) + bint PyDataType_ISNUMBER(dtype) + bint PyDataType_ISSTRING(dtype) + bint PyDataType_ISCOMPLEX(dtype) + bint PyDataType_ISPYTHON(dtype) + bint PyDataType_ISFLEXIBLE(dtype) + bint PyDataType_ISUSERDEF(dtype) + bint PyDataType_ISEXTENDED(dtype) + bint PyDataType_ISOBJECT(dtype) + bint PyDataType_HASFIELDS(dtype) + + bint PyArray_ISBOOL(ndarray) + bint PyArray_ISUNSIGNED(ndarray) + bint PyArray_ISSIGNED(ndarray) + bint PyArray_ISINTEGER(ndarray) + bint PyArray_ISFLOAT(ndarray) + bint PyArray_ISNUMBER(ndarray) + bint PyArray_ISSTRING(ndarray) + bint PyArray_ISCOMPLEX(ndarray) + bint PyArray_ISPYTHON(ndarray) + bint PyArray_ISFLEXIBLE(ndarray) + bint PyArray_ISUSERDEF(ndarray) + bint PyArray_ISEXTENDED(ndarray) + bint PyArray_ISOBJECT(ndarray) + bint PyArray_HASFIELDS(ndarray) + + bint 
PyArray_ISVARIABLE(ndarray) + + bint PyArray_SAFEALIGNEDCOPY(ndarray) + bint PyArray_ISNBO(ndarray) + bint PyArray_IsNativeByteOrder(ndarray) + bint PyArray_ISNOTSWAPPED(ndarray) + bint PyArray_ISBYTESWAPPED(ndarray) + + bint PyArray_FLAGSWAP(ndarray, int) + + bint PyArray_ISCARRAY(ndarray) + bint PyArray_ISCARRAY_RO(ndarray) + bint PyArray_ISFARRAY(ndarray) + bint PyArray_ISFARRAY_RO(ndarray) + bint PyArray_ISBEHAVED(ndarray) + bint PyArray_ISBEHAVED_RO(ndarray) + + + bint PyDataType_ISNOTSWAPPED(dtype) + bint PyDataType_ISBYTESWAPPED(dtype) + + bint PyArray_DescrCheck(object) + + bint PyArray_Check(object) + bint PyArray_CheckExact(object) + + # Cannot be supported due to out arg: + # bint PyArray_HasArrayInterfaceType(object, dtype, object, object&) + # bint PyArray_HasArrayInterface(op, out) + + + bint PyArray_IsZeroDim(object) + # Cannot be supported due to ## ## in macro: + # bint PyArray_IsScalar(object, verbatim work) + bint PyArray_CheckScalar(object) + bint PyArray_IsPythonNumber(object) + bint PyArray_IsPythonScalar(object) + bint PyArray_IsAnyScalar(object) + bint PyArray_CheckAnyScalar(object) + ndarray PyArray_GETCONTIGUOUS(ndarray) + bint PyArray_SAMESHAPE(ndarray, ndarray) + npy_intp PyArray_SIZE(ndarray) + npy_intp PyArray_NBYTES(ndarray) + + object PyArray_FROM_O(object) + object PyArray_FROM_OF(object m, int flags) + bint PyArray_FROM_OT(object m, int type) + bint PyArray_FROM_OTF(object m, int type, int flags) + object PyArray_FROMANY(object m, int type, int min, int max, int flags) + object PyArray_ZEROS(int nd, npy_intp* dims, int type, int fortran) + object PyArray_EMPTY(int nd, npy_intp* dims, int type, int fortran) + void PyArray_FILLWBYTE(object, int val) + npy_intp PyArray_REFCOUNT(object) + object PyArray_ContiguousFromAny(op, int, int min_depth, int max_depth) + unsigned char PyArray_EquivArrTypes(ndarray a1, ndarray a2) + bint PyArray_EquivByteorders(int b1, int b2) + object PyArray_SimpleNew(int nd, npy_intp* dims, int typenum) + object PyArray_SimpleNewFromData(int nd, npy_intp* dims, int typenum, void* data) + #object PyArray_SimpleNewFromDescr(int nd, npy_intp* dims, dtype descr) + object PyArray_ToScalar(void* data, ndarray arr) + + void* PyArray_GETPTR1(ndarray m, npy_intp i) + void* PyArray_GETPTR2(ndarray m, npy_intp i, npy_intp j) + void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k) + void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) + + void PyArray_XDECREF_ERR(ndarray) + # Cannot be supported due to out arg + # void PyArray_DESCR_REPLACE(descr) + + + object PyArray_Copy(ndarray) + object PyArray_FromObject(object op, int type, int min_depth, int max_depth) + object PyArray_ContiguousFromObject(object op, int type, int min_depth, int max_depth) + object PyArray_CopyFromObject(object op, int type, int min_depth, int max_depth) + + object PyArray_Cast(ndarray mp, int type_num) + object PyArray_Take(ndarray ap, object items, int axis) + object PyArray_Put(ndarray ap, object items, object values) + + void PyArray_ITER_RESET(flatiter it) nogil + void PyArray_ITER_NEXT(flatiter it) nogil + void PyArray_ITER_GOTO(flatiter it, npy_intp* destination) nogil + void PyArray_ITER_GOTO1D(flatiter it, npy_intp ind) nogil + void* PyArray_ITER_DATA(flatiter it) nogil + bint PyArray_ITER_NOTDONE(flatiter it) nogil + + void PyArray_MultiIter_RESET(broadcast multi) nogil + void PyArray_MultiIter_NEXT(broadcast multi) nogil + void PyArray_MultiIter_GOTO(broadcast multi, npy_intp dest) nogil + void 
PyArray_MultiIter_GOTO1D(broadcast multi, npy_intp ind) nogil + void* PyArray_MultiIter_DATA(broadcast multi, npy_intp i) nogil + void PyArray_MultiIter_NEXTi(broadcast multi, npy_intp i) nogil + bint PyArray_MultiIter_NOTDONE(broadcast multi) nogil + + # Functions from __multiarray_api.h + + # Functions taking dtype and returning object/ndarray are disabled + # for now as they steal dtype references. I'm conservative and disable + # more than is probably needed until it can be checked further. + int PyArray_SetNumericOps (object) + object PyArray_GetNumericOps () + int PyArray_INCREF (ndarray) + int PyArray_XDECREF (ndarray) + void PyArray_SetStringFunction (object, int) + dtype PyArray_DescrFromType (int) + object PyArray_TypeObjectFromType (int) + char * PyArray_Zero (ndarray) + char * PyArray_One (ndarray) + #object PyArray_CastToType (ndarray, dtype, int) + int PyArray_CastTo (ndarray, ndarray) + int PyArray_CastAnyTo (ndarray, ndarray) + int PyArray_CanCastSafely (int, int) + npy_bool PyArray_CanCastTo (dtype, dtype) + int PyArray_ObjectType (object, int) + dtype PyArray_DescrFromObject (object, dtype) + #ndarray* PyArray_ConvertToCommonType (object, int *) + dtype PyArray_DescrFromScalar (object) + dtype PyArray_DescrFromTypeObject (object) + npy_intp PyArray_Size (object) + #object PyArray_Scalar (void *, dtype, object) + #object PyArray_FromScalar (object, dtype) + void PyArray_ScalarAsCtype (object, void *) + #int PyArray_CastScalarToCtype (object, void *, dtype) + #int PyArray_CastScalarDirect (object, dtype, void *, int) + object PyArray_ScalarFromObject (object) + #PyArray_VectorUnaryFunc * PyArray_GetCastFunc (dtype, int) + object PyArray_FromDims (int, int *, int) + #object PyArray_FromDimsAndDataAndDescr (int, int *, dtype, char *) + #object PyArray_FromAny (object, dtype, int, int, int, object) + object PyArray_EnsureArray (object) + object PyArray_EnsureAnyArray (object) + #object PyArray_FromFile (stdio.FILE *, dtype, npy_intp, char *) + #object PyArray_FromString (char *, npy_intp, dtype, npy_intp, char *) + #object PyArray_FromBuffer (object, dtype, npy_intp, npy_intp) + #object PyArray_FromIter (object, dtype, npy_intp) + object PyArray_Return (ndarray) + #object PyArray_GetField (ndarray, dtype, int) + #int PyArray_SetField (ndarray, dtype, int, object) + object PyArray_Byteswap (ndarray, npy_bool) + object PyArray_Resize (ndarray, PyArray_Dims *, int, NPY_ORDER) + int PyArray_MoveInto (ndarray, ndarray) + int PyArray_CopyInto (ndarray, ndarray) + int PyArray_CopyAnyInto (ndarray, ndarray) + int PyArray_CopyObject (ndarray, object) + object PyArray_NewCopy (ndarray, NPY_ORDER) + object PyArray_ToList (ndarray) + object PyArray_ToString (ndarray, NPY_ORDER) + int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *) + int PyArray_Dump (object, object, int) + object PyArray_Dumps (object, int) + int PyArray_ValidType (int) + void PyArray_UpdateFlags (ndarray, int) + object PyArray_New (type, int, npy_intp *, int, npy_intp *, void *, int, int, object) + #object PyArray_NewFromDescr (type, dtype, int, npy_intp *, npy_intp *, void *, int, object) + #dtype PyArray_DescrNew (dtype) + dtype PyArray_DescrNewFromType (int) + double PyArray_GetPriority (object, double) + object PyArray_IterNew (object) + object PyArray_MultiIterNew (int, ...) 
+ + int PyArray_PyIntAsInt (object) + npy_intp PyArray_PyIntAsIntp (object) + int PyArray_Broadcast (broadcast) + void PyArray_FillObjectArray (ndarray, object) + int PyArray_FillWithScalar (ndarray, object) + npy_bool PyArray_CheckStrides (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *) + dtype PyArray_DescrNewByteorder (dtype, char) + object PyArray_IterAllButAxis (object, int *) + #object PyArray_CheckFromAny (object, dtype, int, int, int, object) + #object PyArray_FromArray (ndarray, dtype, int) + object PyArray_FromInterface (object) + object PyArray_FromStructInterface (object) + #object PyArray_FromArrayAttr (object, dtype, object) + #NPY_SCALARKIND PyArray_ScalarKind (int, ndarray*) + int PyArray_CanCoerceScalar (int, int, NPY_SCALARKIND) + object PyArray_NewFlagsObject (object) + npy_bool PyArray_CanCastScalar (type, type) + #int PyArray_CompareUCS4 (npy_ucs4 *, npy_ucs4 *, register size_t) + int PyArray_RemoveSmallest (broadcast) + int PyArray_ElementStrides (object) + void PyArray_Item_INCREF (char *, dtype) + void PyArray_Item_XDECREF (char *, dtype) + object PyArray_FieldNames (object) + object PyArray_Transpose (ndarray, PyArray_Dims *) + object PyArray_TakeFrom (ndarray, object, int, ndarray, NPY_CLIPMODE) + object PyArray_PutTo (ndarray, object, object, NPY_CLIPMODE) + object PyArray_PutMask (ndarray, object, object) + object PyArray_Repeat (ndarray, object, int) + object PyArray_Choose (ndarray, object, ndarray, NPY_CLIPMODE) + int PyArray_Sort (ndarray, int, NPY_SORTKIND) + object PyArray_ArgSort (ndarray, int, NPY_SORTKIND) + object PyArray_SearchSorted (ndarray, object, NPY_SEARCHSIDE) + object PyArray_ArgMax (ndarray, int, ndarray) + object PyArray_ArgMin (ndarray, int, ndarray) + object PyArray_Reshape (ndarray, object) + object PyArray_Newshape (ndarray, PyArray_Dims *, NPY_ORDER) + object PyArray_Squeeze (ndarray) + #object PyArray_View (ndarray, dtype, type) + object PyArray_SwapAxes (ndarray, int, int) + object PyArray_Max (ndarray, int, ndarray) + object PyArray_Min (ndarray, int, ndarray) + object PyArray_Ptp (ndarray, int, ndarray) + object PyArray_Mean (ndarray, int, int, ndarray) + object PyArray_Trace (ndarray, int, int, int, int, ndarray) + object PyArray_Diagonal (ndarray, int, int, int) + object PyArray_Clip (ndarray, object, object, ndarray) + object PyArray_Conjugate (ndarray, ndarray) + object PyArray_Nonzero (ndarray) + object PyArray_Std (ndarray, int, int, ndarray, int) + object PyArray_Sum (ndarray, int, int, ndarray) + object PyArray_CumSum (ndarray, int, int, ndarray) + object PyArray_Prod (ndarray, int, int, ndarray) + object PyArray_CumProd (ndarray, int, int, ndarray) + object PyArray_All (ndarray, int, ndarray) + object PyArray_Any (ndarray, int, ndarray) + object PyArray_Compress (ndarray, object, int, ndarray) + object PyArray_Flatten (ndarray, NPY_ORDER) + object PyArray_Ravel (ndarray, NPY_ORDER) + npy_intp PyArray_MultiplyList (npy_intp *, int) + int PyArray_MultiplyIntList (int *, int) + void * PyArray_GetPtr (ndarray, npy_intp*) + int PyArray_CompareLists (npy_intp *, npy_intp *, int) + #int PyArray_AsCArray (object*, void *, npy_intp *, int, dtype) + #int PyArray_As1D (object*, char **, int *, int) + #int PyArray_As2D (object*, char ***, int *, int *, int) + int PyArray_Free (object, void *) + #int PyArray_Converter (object, object*) + int PyArray_IntpFromSequence (object, npy_intp *, int) + object PyArray_Concatenate (object, int) + object PyArray_InnerProduct (object, object) + object PyArray_MatrixProduct (object, object) + 
object PyArray_CopyAndTranspose (object) + object PyArray_Correlate (object, object, int) + int PyArray_TypestrConvert (int, int) + #int PyArray_DescrConverter (object, dtype*) + #int PyArray_DescrConverter2 (object, dtype*) + int PyArray_IntpConverter (object, PyArray_Dims *) + #int PyArray_BufferConverter (object, chunk) + int PyArray_AxisConverter (object, int *) + int PyArray_BoolConverter (object, npy_bool *) + int PyArray_ByteorderConverter (object, char *) + int PyArray_OrderConverter (object, NPY_ORDER *) + unsigned char PyArray_EquivTypes (dtype, dtype) + #object PyArray_Zeros (int, npy_intp *, dtype, int) + #object PyArray_Empty (int, npy_intp *, dtype, int) + object PyArray_Where (object, object, object) + object PyArray_Arange (double, double, double, int) + #object PyArray_ArangeObj (object, object, object, dtype) + int PyArray_SortkindConverter (object, NPY_SORTKIND *) + object PyArray_LexSort (object, int) + object PyArray_Round (ndarray, int, ndarray) + unsigned char PyArray_EquivTypenums (int, int) + int PyArray_RegisterDataType (dtype) + int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *) + int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND) + #void PyArray_InitArrFuncs (PyArray_ArrFuncs *) + object PyArray_IntTupleFromIntp (int, npy_intp *) + int PyArray_TypeNumFromName (char *) + int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *) + #int PyArray_OutputConverter (object, ndarray*) + object PyArray_BroadcastToShape (object, npy_intp *, int) + void _PyArray_SigintHandler (int) + void* _PyArray_GetSigintBuf () + #int PyArray_DescrAlignConverter (object, dtype*) + #int PyArray_DescrAlignConverter2 (object, dtype*) + int PyArray_SearchsideConverter (object, void *) + object PyArray_CheckAxis (ndarray, int *, int) + npy_intp PyArray_OverflowMultiplyList (npy_intp *, int) + int PyArray_CompareString (char *, char *, size_t) + + +# Typedefs that matches the runtime dtype objects in +# the numpy module. + +# The ones that are commented out needs an IFDEF function +# in Cython to enable them only on the right systems. 
+ +ctypedef npy_int8 int8_t +ctypedef npy_int16 int16_t +ctypedef npy_int32 int32_t +ctypedef npy_int64 int64_t +#ctypedef npy_int96 int96_t +#ctypedef npy_int128 int128_t + +ctypedef npy_uint8 uint8_t +ctypedef npy_uint16 uint16_t +ctypedef npy_uint32 uint32_t +ctypedef npy_uint64 uint64_t +#ctypedef npy_uint96 uint96_t +#ctypedef npy_uint128 uint128_t + +ctypedef npy_float32 float32_t +ctypedef npy_float64 float64_t +#ctypedef npy_float80 float80_t +#ctypedef npy_float128 float128_t + +ctypedef float complex complex64_t +ctypedef double complex complex128_t + +# The int types are mapped a bit surprising -- +# numpy.int corresponds to 'l' and numpy.long to 'q' +ctypedef npy_long int_t +ctypedef npy_longlong long_t +ctypedef npy_longlong longlong_t + +ctypedef npy_ulong uint_t +ctypedef npy_ulonglong ulong_t +ctypedef npy_ulonglong ulonglong_t + +ctypedef npy_intp intp_t +ctypedef npy_uintp uintp_t + +ctypedef npy_double float_t +ctypedef npy_double double_t +ctypedef npy_longdouble longdouble_t + +ctypedef npy_cfloat cfloat_t +ctypedef npy_cdouble cdouble_t +ctypedef npy_clongdouble clongdouble_t + +ctypedef npy_cdouble complex_t + +cdef inline object PyArray_MultiIterNew1(a): + return PyArray_MultiIterNew(1, a) + +cdef inline object PyArray_MultiIterNew2(a, b): + return PyArray_MultiIterNew(2, a, b) + +cdef inline object PyArray_MultiIterNew3(a, b, c): + return PyArray_MultiIterNew(3, a, b, c) + +cdef inline object PyArray_MultiIterNew4(a, b, c, d): + return PyArray_MultiIterNew(4, a, b, c, d) + +cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + return PyArray_MultiIterNew(5, a, b, c, d, e) + +cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: + # Recursive utility function used in __getbuffer__ to get format + # string. The new location in the format string is returned. + + cdef dtype child + cdef int delta_offset + cdef tuple i + cdef int endian_detector = 1 + cdef bint little_endian = ((&endian_detector)[0] != 0) + cdef tuple fields + + for childname in descr.names: + fields = descr.fields[childname] + child, new_offset = fields + + if (end - f) - (new_offset - offset[0]) < 15: + raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + + if ((child.byteorder == '>' and little_endian) or + (child.byteorder == '<' and not little_endian)): + raise ValueError(u"Non-native byte order not supported") + # One could encode it in the format string and have Cython + # complain instead, BUT: < and > in format strings also imply + # standardized sizes for datatypes, and we rely on native in + # order to avoid reencoding data types based on their size. + # + # A proper PEP 3118 exporter for other clients than Cython + # must deal properly with this! 
+ + # Output padding bytes + while offset[0] < new_offset: + f[0] = 120 # "x"; pad byte + f += 1 + offset[0] += 1 + + offset[0] += child.itemsize + + if not PyDataType_HASFIELDS(child): + t = child.type_num + if end - f < 5: + raise RuntimeError(u"Format string allocated too short.") + + # Until ticket #99 is fixed, use integers to avoid warnings + if t == NPY_BYTE: f[0] = 98 #"b" + elif t == NPY_UBYTE: f[0] = 66 #"B" + elif t == NPY_SHORT: f[0] = 104 #"h" + elif t == NPY_USHORT: f[0] = 72 #"H" + elif t == NPY_INT: f[0] = 105 #"i" + elif t == NPY_UINT: f[0] = 73 #"I" + elif t == NPY_LONG: f[0] = 108 #"l" + elif t == NPY_ULONG: f[0] = 76 #"L" + elif t == NPY_LONGLONG: f[0] = 113 #"q" + elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + elif t == NPY_FLOAT: f[0] = 102 #"f" + elif t == NPY_DOUBLE: f[0] = 100 #"d" + elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + elif t == NPY_OBJECT: f[0] = 79 #"O" + else: + raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + f += 1 + else: + # Cython ignores struct boundary information ("T{...}"), + # so don't output it + f = _util_dtypestring(child, f, end, offset) + return f + + +# +# ufunc API +# + +cdef extern from "numpy/ufuncobject.h": + + ctypedef void (*PyUFuncGenericFunction) (char **, npy_intp *, npy_intp *, void *) + + ctypedef extern class numpy.ufunc [object PyUFuncObject]: + cdef: + int nin, nout, nargs + int identity + PyUFuncGenericFunction *functions + void **data + int ntypes + int check_return + char *name, *types + char *doc + void *ptr + PyObject *obj + PyObject *userloops + + cdef enum: + PyUFunc_Zero + PyUFunc_One + PyUFunc_None + UFUNC_ERR_IGNORE + UFUNC_ERR_WARN + UFUNC_ERR_RAISE + UFUNC_ERR_CALL + UFUNC_ERR_PRINT + UFUNC_ERR_LOG + UFUNC_MASK_DIVIDEBYZERO + UFUNC_MASK_OVERFLOW + UFUNC_MASK_UNDERFLOW + UFUNC_MASK_INVALID + UFUNC_SHIFT_DIVIDEBYZERO + UFUNC_SHIFT_OVERFLOW + UFUNC_SHIFT_UNDERFLOW + UFUNC_SHIFT_INVALID + UFUNC_FPE_DIVIDEBYZERO + UFUNC_FPE_OVERFLOW + UFUNC_FPE_UNDERFLOW + UFUNC_FPE_INVALID + UFUNC_ERR_DEFAULT + UFUNC_ERR_DEFAULT2 + + object PyUFunc_FromFuncAndData(PyUFuncGenericFunction *, + void **, char *, int, int, int, int, char *, char *, int) + int PyUFunc_RegisterLoopForType(ufunc, int, + PyUFuncGenericFunction, int *, void *) + int PyUFunc_GenericFunction \ + (ufunc, PyObject *, PyObject *, PyArrayObject **) + void PyUFunc_f_f_As_d_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_d_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_f_f \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_g_g \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_F_F_As_D_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_F_F \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_D_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_G_G \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_O_O \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_ff_f_As_dd_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_ff_f \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_dd_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_gg_g \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_FF_F_As_DD_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_DD_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_FF_F \ + (char **, 
npy_intp *, npy_intp *, void *) + void PyUFunc_GG_G \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_OO_O \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_O_O_method \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_OO_O_method \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_On_Om \ + (char **, npy_intp *, npy_intp *, void *) + int PyUFunc_GetPyValues \ + (char *, int *, int *, PyObject **) + int PyUFunc_checkfperr \ + (int, PyObject *, int *) + void PyUFunc_clearfperr() + int PyUFunc_getfperr() + int PyUFunc_handlefperr \ + (int, PyObject *, int, int *) + int PyUFunc_ReplaceLoopBySignature \ + (ufunc, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *) + object PyUFunc_FromFuncAndDataAndSignature \ + (PyUFuncGenericFunction *, void **, char *, int, int, int, + int, char *, char *, int, char *) + + void import_ufunc() + + +cdef inline void set_array_base(ndarray arr, object base): + cdef PyObject* baseptr + if base is None: + baseptr = NULL + else: + Py_INCREF(base) # important to do this before decref below! + baseptr = base + Py_XDECREF(arr.base) + arr.base = baseptr + +cdef inline object get_array_base(ndarray arr): + if arr.base is NULL: + return None + else: + return arr.base diff --git a/pandas/src/numpy_helper.h b/pandas/src/numpy_helper.h index b2fecfdd7ed35..b63835119fb35 100644 --- a/pandas/src/numpy_helper.h +++ b/pandas/src/numpy_helper.h @@ -64,6 +64,10 @@ PANDAS_INLINE int is_float_object(PyObject* obj) { return (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)); } +PANDAS_INLINE int +is_complex_object(PyObject* obj) { + return (PyComplex_Check(obj) || PyArray_IsScalar(obj, ComplexFloating)); +} PANDAS_INLINE int is_bool_object(PyObject* obj) { diff --git a/pandas/src/period.c b/pandas/src/period.c index d7c3260f71866..447a183c19821 100644 --- a/pandas/src/period.c +++ b/pandas/src/period.c @@ -1,6 +1,5 @@ #include "period.h" -#include "limits.h" -// #include "numpy/ndarraytypes.h" + /* * Borrowed and derived code from scikits.timeseries that we will expose via @@ -14,6 +13,13 @@ * Code derived from scikits.timeseries * ------------------------------------------------------------------*/ + +static int mod_compat(int x, int m) { + int result = x % m; + if (result < 0) return result + m; + return result; +} + static asfreq_info NULL_AF_INFO; /* Table with day offsets for each month (0-based, without and with leap) */ @@ -29,7 +35,7 @@ static int days_in_month[2][12] = { }; /* Return 1/0 iff year points to a leap year in calendar. */ -static int dInfoCalc_Leapyear(int64_t year, int calendar) +static int dInfoCalc_Leapyear(npy_int64 year, int calendar) { if (calendar == GREGORIAN_CALENDAR) { return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); @@ -39,7 +45,7 @@ static int dInfoCalc_Leapyear(int64_t year, int calendar) } /* Return the day of the week for the given absolute date. */ -static int dInfoCalc_DayOfWeek(int64_t absdate) +static int dInfoCalc_DayOfWeek(npy_int64 absdate) { int day_of_week; @@ -61,7 +67,7 @@ static int monthToQuarter(int month) { return ((month-1)/3)+1; } using the Gregorian Epoch) value by two days because the Epoch (0001-01-01) in the Julian calendar lies 2 days before the Epoch in the Gregorian calendar. 
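The new mod_compat helper above gives Python-style floored modulo, which the C % operator does not provide for negative dividends. Two hand-checked values (illustrative; the function is static to period.c):

    mod_compat(-1, 12);   /* -> 11, whereas in C, -1 % 12 == -1 */
    mod_compat(25, 12);   /* ->  1 */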
*/ -static int dInfoCalc_YearOffset(int64_t year, int calendar) +static int dInfoCalc_YearOffset(npy_int64 year, int calendar) { year--; if (calendar == GREGORIAN_CALENDAR) { @@ -93,7 +99,8 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, /* Calculate the absolute date */ { int leap; - int64_t yearoffset,absdate; + npy_int64 absdate; + int yearoffset; /* Range check */ Py_AssertWithArg(year > -(INT_MAX / 366) && year < (INT_MAX / 366), @@ -173,19 +180,18 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, than with this iterative approach... */ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, - int64_t absdate, - int calendar) + npy_int64 absdate, int calendar) { - register int64_t year; - int64_t yearoffset; + register npy_int64 year; + npy_int64 yearoffset; int leap,dayoffset; int *monthoffset; /* Approximate year */ if (calendar == GREGORIAN_CALENDAR) { - year = (int64_t)(((double)absdate) / 365.2425); + year = (npy_int64)(((double)absdate) / 365.2425); } else if (calendar == JULIAN_CALENDAR) { - year = (int64_t)(((double)absdate) / 365.25); + year = (npy_int64)(((double)absdate) / 365.25); } else { Py_Error(PyExc_ValueError, "unknown calendar"); } @@ -194,7 +200,7 @@ int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, /* Apply corrections to reach the correct year */ while (1) { /* Calculate the year offset */ - yearoffset = dInfoCalc_YearOffset(year,calendar); + yearoffset = dInfoCalc_YearOffset(year, calendar); if (PyErr_Occurred()) goto onError; @@ -254,12 +260,11 @@ int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, // helpers for frequency conversion routines // -static int64_t DtoB_weekday(int64_t fromDate) { - return (((fromDate) / 7) * 5) + (fromDate)%7; +static npy_int64 DtoB_weekday(npy_int64 absdate) { + return (((absdate) / 7) * 5) + (absdate) % 7 - BDAY_OFFSET; } -static int64_t DtoB_WeekendToMonday(int64_t absdate, int day_of_week) { - +static npy_int64 DtoB_WeekendToMonday(npy_int64 absdate, int day_of_week) { if (day_of_week > 4) { //change to Monday after weekend absdate += (7 - day_of_week); @@ -267,7 +272,7 @@ static int64_t DtoB_WeekendToMonday(int64_t absdate, int day_of_week) { return DtoB_weekday(absdate); } -static int64_t DtoB_WeekendToFriday(int64_t absdate, int day_of_week) { +static npy_int64 DtoB_WeekendToFriday(npy_int64 absdate, int day_of_week) { if (day_of_week > 4) { //change to friday before weekend absdate -= (day_of_week - 4); @@ -275,7 +280,7 @@ static int64_t DtoB_WeekendToFriday(int64_t absdate, int day_of_week) { return DtoB_weekday(absdate); } -static int64_t absdate_from_ymd(int y, int m, int d) { +static npy_int64 absdate_from_ymd(int y, int m, int d) { struct date_info tempDate; if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0, GREGORIAN_CALENDAR)) { return INT_ERR_CODE; @@ -285,20 +290,24 @@ static int64_t absdate_from_ymd(int y, int m, int d) { //************ FROM DAILY *************** -static int64_t asfreq_DtoA(int64_t fromDate, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, fromDate, - GREGORIAN_CALENDAR)) return INT_ERR_CODE; - if (dinfo.month > af_info->to_a_year_end) { return (int64_t)(dinfo.year + 1); } - else { return (int64_t)(dinfo.year); } + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (dinfo.month > af_info->to_a_year_end) { + return 
(npy_int64)(dinfo.year + 1 - BASE_YEAR); + } + else { + return (npy_int64)(dinfo.year - BASE_YEAR); + } } -static int64_t DtoQ_yq(int64_t fromDate, asfreq_info *af_info, - int *year, int *quarter) { +static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, + int *year, int *quarter) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, fromDate, - GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; if (af_info->to_q_year_end != 12) { dinfo.month -= af_info->to_q_year_end; if (dinfo.month <= 0) { dinfo.month += 12; } @@ -313,34 +322,34 @@ static int64_t DtoQ_yq(int64_t fromDate, asfreq_info *af_info, } -static int64_t asfreq_DtoQ(int64_t fromDate, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { int year, quarter; - if (DtoQ_yq(fromDate, af_info, &year, &quarter) == INT_ERR_CODE) { + if (DtoQ_yq(ordinal, af_info, &year, &quarter) == INT_ERR_CODE) { return INT_ERR_CODE; } - return (int64_t)((year - 1) * 4 + quarter); + return (npy_int64)((year - BASE_YEAR) * 4 + quarter - 1); } -static int64_t asfreq_DtoM(int64_t fromDate, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, fromDate, GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) return INT_ERR_CODE; - return (int64_t)((dinfo.year - 1) * 12 + dinfo.month); + return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); } -static int64_t asfreq_DtoW(int64_t fromDate, char relation, asfreq_info *af_info) { - return (fromDate - (1 + af_info->to_week_end))/7 + 1; +static npy_int64 asfreq_DtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return (ordinal + ORD_OFFSET - (1 + af_info->to_week_end))/7 + 1 - WEEK_OFFSET; } -static int64_t asfreq_DtoB(int64_t fromDate, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, fromDate, - GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; if (relation == 'S') { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); @@ -349,222 +358,240 @@ static int64_t asfreq_DtoB(int64_t fromDate, char relation, asfreq_info *af_info } } -static int64_t asfreq_DtoB_forConvert(int64_t fromDate, char relation, asfreq_info *af_info) { - - struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, fromDate, GREGORIAN_CALENDAR)) - return INT_ERR_CODE; - - if (dinfo.day_of_week > 4) { - return INT_ERR_CODE; - } else { - return DtoB_weekday(fromDate); - } -} - // needed for getDateInfo function -static int64_t asfreq_DtoD(int64_t fromDate, char relation, asfreq_info *af_info) { return fromDate; } +static npy_int64 asfreq_DtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { return ordinal; } -static int64_t asfreq_DtoHIGHFREQ(int64_t fromDate, char relation, int64_t periodsPerDay) { - if (fromDate >= HIGHFREQ_ORIG) { - if (relation == 'S') { return (fromDate - HIGHFREQ_ORIG)*(periodsPerDay) + 1; } - else { return (fromDate - HIGHFREQ_ORIG + 1)*(periodsPerDay); } - } else { return INT_ERR_CODE; } +static npy_int64 asfreq_DtoHIGHFREQ(npy_int64 ordinal, char relation, npy_int64 
per_day) { + if (relation == 'S') { + return ordinal * per_day; + } + else { + return (ordinal+ 1) * per_day - 1; + } } -static int64_t asfreq_DtoH(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoHIGHFREQ(fromDate, relation, 24); } -static int64_t asfreq_DtoT(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoHIGHFREQ(fromDate, relation, 24*60); } -static int64_t asfreq_DtoS(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoHIGHFREQ(fromDate, relation, 24*60*60); } +static npy_int64 asfreq_DtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoHIGHFREQ(ordinal, relation, 24); } +static npy_int64 asfreq_DtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoHIGHFREQ(ordinal, relation, 24*60); } +static npy_int64 asfreq_DtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoHIGHFREQ(ordinal, relation, 24*60*60); } //************ FROM SECONDLY *************** -static int64_t asfreq_StoD(int64_t fromDate, char relation, asfreq_info *af_info) - { return (fromDate - 1)/(60*60*24) + HIGHFREQ_ORIG; } - -static int64_t asfreq_StoA(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoA(asfreq_StoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_StoQ(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoQ(asfreq_StoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_StoM(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoM(asfreq_StoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_StoW(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoW(asfreq_StoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_StoB(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoB(asfreq_StoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_StoB_forConvert(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoB_forConvert(asfreq_StoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_StoT(int64_t fromDate, char relation, asfreq_info *af_info) - { return (fromDate - 1)/60 + 1; } -static int64_t asfreq_StoH(int64_t fromDate, char relation, asfreq_info *af_info) - { return (fromDate - 1)/(60*60) + 1; } +static npy_int64 asfreq_StoD(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return (ordinal)/(60*60*24); } + +static npy_int64 asfreq_StoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_StoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_StoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_StoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_StoB(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoB(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + + +static 
npy_int64 asfreq_StoT(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return ordinal / 60; +} + +static npy_int64 asfreq_StoH(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return ordinal / (60*60); +} //************ FROM MINUTELY *************** -static int64_t asfreq_TtoD(int64_t fromDate, char relation, asfreq_info *af_info) - { return (fromDate - 1)/(60*24) + HIGHFREQ_ORIG; } - -static int64_t asfreq_TtoA(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoA(asfreq_TtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_TtoQ(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoQ(asfreq_TtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_TtoM(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoM(asfreq_TtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_TtoW(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoW(asfreq_TtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_TtoB(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoB(asfreq_TtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } - -static int64_t asfreq_TtoB_forConvert(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoB_forConvert(asfreq_TtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } - -static int64_t asfreq_TtoH(int64_t fromDate, char relation, asfreq_info *af_info) - { return (fromDate - 1)/60 + 1; } -static int64_t asfreq_TtoS(int64_t fromDate, char relation, asfreq_info *af_info) { - if (relation == 'S') { return fromDate*60 - 59; } - else { return fromDate*60; }} +static npy_int64 asfreq_TtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return (ordinal)/(60*24); } + +static npy_int64 asfreq_TtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_TtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_TtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_TtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_TtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoB(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_TtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return ordinal / 60; +} + +static npy_int64 asfreq_TtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) { + if (relation == 'S') { + return ordinal*60; } + else { + return ordinal*60 + 59; + } +} //************ FROM HOURLY *************** -static int64_t asfreq_HtoD(int64_t fromDate, char relation, asfreq_info *af_info) - { return (fromDate - 1)/24 + HIGHFREQ_ORIG; } -static int64_t asfreq_HtoA(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoA(asfreq_HtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_HtoQ(int64_t fromDate, char relation, asfreq_info *af_info) - { 
return asfreq_DtoQ(asfreq_HtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_HtoM(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoM(asfreq_HtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_HtoW(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoW(asfreq_HtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_HtoB(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoB(asfreq_HtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } - -static int64_t asfreq_HtoB_forConvert(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoB_forConvert(asfreq_HtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_HtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return ordinal / 24; } +static npy_int64 asfreq_HtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_HtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_HtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_HtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_HtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoB(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } // calculation works out the same as TtoS, so we just call that function for HtoT -static int64_t asfreq_HtoT(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_TtoS(fromDate, relation, &NULL_AF_INFO); } -static int64_t asfreq_HtoS(int64_t fromDate, char relation, asfreq_info *af_info) { - if (relation == 'S') { return fromDate*60*60 - 60*60 + 1; } - else { return fromDate*60*60; }} +static npy_int64 asfreq_HtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_TtoS(ordinal, relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_HtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) { + if (relation == 'S') { + return ordinal*60*60; + } + else { + return (ordinal + 1)*60*60 - 1; + } +} //************ FROM BUSINESS *************** -static int64_t asfreq_BtoD(int64_t fromDate, char relation, asfreq_info *af_info) - { return ((fromDate-1)/5)*7 + (fromDate-1)%5 + 1; } +static npy_int64 asfreq_BtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) + { + ordinal += BDAY_OFFSET; + return (((ordinal - 1) / 5) * 7 + + mod_compat(ordinal - 1, 5) + 1 - ORD_OFFSET); + } -static int64_t asfreq_BtoA(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoA(asfreq_BtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_BtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_BtoQ(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoQ(asfreq_BtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_BtoQ(npy_int64 
ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_BtoM(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoM(asfreq_BtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_BtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_BtoW(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoW(asfreq_BtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_BtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_BtoH(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoH(asfreq_BtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_BtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_BtoT(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoT(asfreq_BtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_BtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_BtoS(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoS(asfreq_BtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_BtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } //************ FROM WEEKLY *************** -static int64_t asfreq_WtoD(int64_t fromDate, char relation, asfreq_info *af_info) { - if (relation == 'S') { return fromDate * 7 - 6 + af_info->from_week_end;} - else { return fromDate * 7 + af_info->from_week_end; } +static npy_int64 asfreq_WtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { + ordinal += WEEK_OFFSET; + if (relation == 'S') { + return ordinal * 7 - 6 + af_info->from_week_end - ORD_OFFSET; + } + else { + return ordinal * 7 + af_info->from_week_end - ORD_OFFSET; + } } -static int64_t asfreq_WtoA(int64_t fromDate, char relation, asfreq_info *af_info) { - return asfreq_DtoA(asfreq_WtoD(fromDate, 'E', af_info), relation, af_info); } -static int64_t asfreq_WtoQ(int64_t fromDate, char relation, asfreq_info *af_info) { - return asfreq_DtoQ(asfreq_WtoD(fromDate, 'E', af_info), relation, af_info); } -static int64_t asfreq_WtoM(int64_t fromDate, char relation, asfreq_info *af_info) { - return asfreq_DtoM(asfreq_WtoD(fromDate, 'E', af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_WtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoA(asfreq_WtoD(ordinal, 'E', af_info), relation, af_info); } +static npy_int64 asfreq_WtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoQ(asfreq_WtoD(ordinal, 'E', af_info), relation, af_info); } +static npy_int64 asfreq_WtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoM(asfreq_WtoD(ordinal, 'E', af_info), relation, &NULL_AF_INFO); } -static int64_t asfreq_WtoW(int64_t fromDate, 
char relation, asfreq_info *af_info) - { return asfreq_DtoW(asfreq_WtoD(fromDate, relation, af_info), relation, af_info); } +static npy_int64 asfreq_WtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_WtoD(ordinal, relation, af_info), relation, af_info); } -static int64_t asfreq_WtoB(int64_t fromDate, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, asfreq_WtoD(fromDate, relation, af_info), + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_WtoD(ordinal, relation, af_info) + ORD_OFFSET, GREGORIAN_CALENDAR)) return INT_ERR_CODE; - if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } - else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } + if (relation == 'S') { + return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); + } + else { + return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); + } } -static int64_t asfreq_WtoH(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoH(asfreq_WtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } -static int64_t asfreq_WtoT(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoT(asfreq_WtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } -static int64_t asfreq_WtoS(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoS(asfreq_WtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_WtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_WtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_WtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_WtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_WtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_WtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } //************ FROM MONTHLY *************** -static void MtoD_ym(int64_t fromDate, int64_t *y, int64_t *m) { - *y = (fromDate - 1) / 12 + 1; - *m = fromDate - 12 * (*y) - 1; +static void MtoD_ym(npy_int64 ordinal, int *y, int *m) { + *y = ordinal / 12 + BASE_YEAR; + *m = mod_compat(ordinal, 12) + 1; } -static int64_t asfreq_MtoD(int64_t fromDate, char relation, asfreq_info *af_info) { - int64_t y, m, absdate; +static npy_int64 asfreq_MtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + npy_int64 absdate; + int y, m; if (relation == 'S') { - MtoD_ym(fromDate, &y, &m); + MtoD_ym(ordinal, &y, &m); if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; - return absdate; + return absdate - ORD_OFFSET; } else { - MtoD_ym(fromDate+1, &y, &m); + MtoD_ym(ordinal + 1, &y, &m); if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; - return absdate-1; + return absdate - 1 - ORD_OFFSET; } } -static int64_t asfreq_MtoA(int64_t fromDate, char relation, asfreq_info *af_info) { - return asfreq_DtoA(asfreq_MtoD(fromDate, 'E', &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_MtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoA(asfreq_MtoD(ordinal, 'E', &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_MtoQ(int64_t fromDate, char relation, asfreq_info *af_info) { - return asfreq_DtoQ(asfreq_MtoD(fromDate, 'E', 
&NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_MtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoQ(asfreq_MtoD(ordinal, 'E', &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_MtoW(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoW(asfreq_MtoD(fromDate, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_MtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_MtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } -static int64_t asfreq_MtoB(int64_t fromDate, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, asfreq_MtoD(fromDate, relation, &NULL_AF_INFO), - GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_MtoD(ordinal, relation, &NULL_AF_INFO) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } } -static int64_t asfreq_MtoH(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoH(asfreq_MtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_MtoT(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoT(asfreq_MtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } -static int64_t asfreq_MtoS(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoS(asfreq_MtoD(fromDate, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_MtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_MtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_MtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_MtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_MtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_MtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } //************ FROM QUARTERLY *************** -static void QtoD_ym(int64_t fromDate, int64_t *y, int64_t *m, asfreq_info *af_info) { - - *y = (fromDate - 1) / 4 + 1; - *m = (fromDate + 4) * 3 - 12 * (*y) - 2; +static void QtoD_ym(npy_int64 ordinal, int *y, int *m, asfreq_info *af_info) { + *y = ordinal / 4 + BASE_YEAR; + *m = (ordinal % 4) * 3 + 1; if (af_info->from_q_year_end != 12) { *m += af_info->from_q_year_end; @@ -573,106 +600,117 @@ static void QtoD_ym(int64_t fromDate, int64_t *y, int64_t *m, asfreq_info *af_in } } -static int64_t asfreq_QtoD(int64_t fromDate, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_QtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { - int64_t y, m, absdate; + npy_int64 absdate; + int y, m; if (relation == 'S') { - QtoD_ym(fromDate, &y, &m, af_info); + QtoD_ym(ordinal, &y, &m, af_info); + // printf("ordinal: %d, year: %d, month: %d\n", (int) ordinal, y, m); if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; - return absdate; + return absdate - ORD_OFFSET; } else { - QtoD_ym(fromDate+1, &y, &m, af_info); + QtoD_ym(ordinal+1, &y, &m, af_info); + // printf("ordinal: %d, year: %d, month: %d\n", (int) ordinal, y, m); if ((absdate = absdate_from_ymd(y, 
m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; - return absdate - 1; + return absdate - 1 - ORD_OFFSET; } } -static int64_t asfreq_QtoQ(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoQ(asfreq_QtoD(fromDate, relation, af_info), relation, af_info); } +static npy_int64 asfreq_QtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_QtoD(ordinal, relation, af_info), relation, af_info); } -static int64_t asfreq_QtoA(int64_t fromDate, char relation, asfreq_info *af_info) { - return asfreq_DtoA(asfreq_QtoD(fromDate, relation, af_info), relation, af_info); } +static npy_int64 asfreq_QtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoA(asfreq_QtoD(ordinal, relation, af_info), relation, af_info); } -static int64_t asfreq_QtoM(int64_t fromDate, char relation, asfreq_info *af_info) { - return asfreq_DtoM(asfreq_QtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_QtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoM(asfreq_QtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } -static int64_t asfreq_QtoW(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoW(asfreq_QtoD(fromDate, relation, af_info), relation, af_info); } +static npy_int64 asfreq_QtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_QtoD(ordinal, relation, af_info), relation, af_info); } -static int64_t asfreq_QtoB(int64_t fromDate, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, asfreq_QtoD(fromDate, relation, af_info), - GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_QtoD(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } } -static int64_t asfreq_QtoH(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoH(asfreq_QtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } -static int64_t asfreq_QtoT(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoT(asfreq_QtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } -static int64_t asfreq_QtoS(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoS(asfreq_QtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_QtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_QtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_QtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_QtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_QtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_QtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } //************ FROM ANNUAL *************** -static int64_t asfreq_AtoD(int64_t fromDate, char relation, asfreq_info *af_info) { - int64_t absdate, year, final_adj; +static npy_int64 asfreq_AtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { + npy_int64 absdate, final_adj; + int year; int month = (af_info->from_a_year_end) % 12; + // start from 1970 + ordinal += 
BASE_YEAR; + if (month == 0) { month = 1; } else { month += 1; } if (relation == 'S') { - if (af_info->from_a_year_end == 12) {year = fromDate;} - else {year = fromDate - 1;} + if (af_info->from_a_year_end == 12) {year = ordinal;} + else {year = ordinal - 1;} final_adj = 0; } else { - if (af_info->from_a_year_end == 12) {year = fromDate+1;} - else {year = fromDate;} + if (af_info->from_a_year_end == 12) {year = ordinal+1;} + else {year = ordinal;} final_adj = -1; } absdate = absdate_from_ymd(year, month, 1); - if (absdate == INT_ERR_CODE) return INT_ERR_CODE; - return absdate + final_adj; + if (absdate == INT_ERR_CODE) { + return INT_ERR_CODE; + } + return absdate + final_adj - ORD_OFFSET; } -static int64_t asfreq_AtoA(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoA(asfreq_AtoD(fromDate, relation, af_info), relation, af_info); } +static npy_int64 asfreq_AtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_AtoD(ordinal, relation, af_info), relation, af_info); } -static int64_t asfreq_AtoQ(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoQ(asfreq_AtoD(fromDate, relation, af_info), relation, af_info); } +static npy_int64 asfreq_AtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_AtoD(ordinal, relation, af_info), relation, af_info); } -static int64_t asfreq_AtoM(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoM(asfreq_AtoD(fromDate, relation, af_info), relation, af_info); } +static npy_int64 asfreq_AtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_AtoD(ordinal, relation, af_info), relation, af_info); } -static int64_t asfreq_AtoW(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoW(asfreq_AtoD(fromDate, relation, af_info), relation, af_info); } +static npy_int64 asfreq_AtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_AtoD(ordinal, relation, af_info), relation, af_info); } -static int64_t asfreq_AtoB(int64_t fromDate, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, asfreq_AtoD(fromDate, relation, af_info), + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_AtoD(ordinal, relation, af_info) + ORD_OFFSET, GREGORIAN_CALENDAR)) return INT_ERR_CODE; if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } } -static int64_t asfreq_AtoH(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoH(asfreq_AtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } -static int64_t asfreq_AtoT(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoT(asfreq_AtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } -static int64_t asfreq_AtoS(int64_t fromDate, char relation, asfreq_info *af_info) - { return asfreq_DtoS(asfreq_AtoD(fromDate, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_AtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_AtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_AtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_AtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 
asfreq_AtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_AtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } -static int64_t nofunc(int64_t fromDate, char relation, asfreq_info *af_info) { return INT_ERR_CODE; } -static int64_t no_op(int64_t fromDate, char relation, asfreq_info *af_info) { return fromDate; } +static npy_int64 nofunc(npy_int64 ordinal, char relation, asfreq_info *af_info) { return INT_ERR_CODE; } +static npy_int64 no_op(npy_int64 ordinal, char relation, asfreq_info *af_info) { return ordinal; } // end of frequency specific conversion routines @@ -720,7 +758,7 @@ void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) { } -freq_conv_func get_asfreq_func(int fromFreq, int toFreq, int forConvert) +freq_conv_func get_asfreq_func(int fromFreq, int toFreq) { int fromGroup = get_freq_group(fromFreq); int toGroup = get_freq_group(toFreq); @@ -811,9 +849,7 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq, int forConvert) case FR_QTR: return &asfreq_DtoQ; case FR_MTH: return &asfreq_DtoM; case FR_WK: return &asfreq_DtoW; - case FR_BUS: - if (forConvert) { return &asfreq_DtoB_forConvert; } - else { return &asfreq_DtoB; } + case FR_BUS: return &asfreq_DtoB; case FR_DAY: return &asfreq_DtoD; case FR_HR: return &asfreq_DtoH; case FR_MIN: return &asfreq_DtoT; @@ -828,9 +864,7 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq, int forConvert) case FR_QTR: return &asfreq_HtoQ; case FR_MTH: return &asfreq_HtoM; case FR_WK: return &asfreq_HtoW; - case FR_BUS: - if (forConvert) { return &asfreq_HtoB_forConvert; } - else { return &asfreq_HtoB; } + case FR_BUS: return &asfreq_HtoB; case FR_DAY: return &asfreq_HtoD; case FR_HR: return &no_op; case FR_MIN: return &asfreq_HtoT; @@ -845,9 +879,7 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq, int forConvert) case FR_QTR: return &asfreq_TtoQ; case FR_MTH: return &asfreq_TtoM; case FR_WK: return &asfreq_TtoW; - case FR_BUS: - if (forConvert) { return &asfreq_TtoB_forConvert; } - else { return &asfreq_TtoB; } + case FR_BUS: return &asfreq_TtoB; case FR_DAY: return &asfreq_TtoD; case FR_HR: return &asfreq_TtoH; case FR_MIN: return &no_op; @@ -862,9 +894,7 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq, int forConvert) case FR_QTR: return &asfreq_StoQ; case FR_MTH: return &asfreq_StoM; case FR_WK: return &asfreq_StoW; - case FR_BUS: - if (forConvert) { return &asfreq_StoB_forConvert; } - else { return &asfreq_StoB; } + case FR_BUS: return &asfreq_StoB; case FR_DAY: return &asfreq_StoD; case FR_HR: return &asfreq_StoH; case FR_MIN: return &asfreq_StoT; @@ -875,27 +905,35 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq, int forConvert) } } -double getAbsTime(int freq, int64_t dailyDate, int64_t originalDate) { - - int64_t startOfDay, periodsPerDay; +double get_abs_time(int freq, npy_int64 daily_ord, npy_int64 ordinal) { + npy_int64 start_ord, per_day, unit; switch(freq) { case FR_HR: - periodsPerDay = 24; + per_day = 24; + unit = 60 * 60; break; case FR_MIN: - periodsPerDay = 24*60; + per_day = 24*60; + unit = 60; break; case FR_SEC: - periodsPerDay = 24*60*60; + per_day = 24*60*60; + unit = 1; break; default: return 0; // 24*60*60 - 1; } - startOfDay = asfreq_DtoHIGHFREQ(dailyDate, 'S', periodsPerDay); - return (24*60*60)*((double)(originalDate - startOfDay))/((double)periodsPerDay); + start_ord = asfreq_DtoHIGHFREQ(daily_ord, 'S', per_day); + /* printf("start_ord: %d\n", start_ord); */ + return (double) ( unit * (ordinal - start_ord)); + 
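/* ------------------------------------------------------------------
 * Editor's note (not part of the patch): with the epoch rebased to
 * 1970-01-01, get_abs_time() above reduces to plain integer
 * arithmetic -- for a secondly ordinal the result is simply the number
 * of seconds past midnight of its day.  The standalone sketch below
 * reproduces that computation; the helper names sec_to_day and
 * day_start_sec are illustrative only and do not exist in period.c.
 * For pre-1970 ordinals C's truncating division makes the result
 * negative, which is why the callers later in this patch add 86400
 * and step the daily ordinal back by one.
 * ------------------------------------------------------------------ */
#include <stdio.h>

typedef long long ord_t;

/* secondly ordinal -> daily ordinal, mirroring asfreq_StoD above       */
static ord_t sec_to_day(ord_t sec_ord)   { return sec_ord / 86400; }

/* first secondly ordinal of a given day, i.e. asfreq_DtoHIGHFREQ('S')  */
static ord_t day_start_sec(ord_t day_ord) { return day_ord * 86400; }

int main(void)
{
    ord_t sec_ord  = 86400 + 10;              /* 1970-01-02 00:00:10    */
    ord_t day_ord  = sec_to_day(sec_ord);     /* -> 1 (1970-01-02)      */
    /* unit == 1 for FR_SEC, so abs_time is seconds since midnight      */
    double abstime = (double)(sec_ord - day_start_sec(day_ord));
    printf("daily ordinal %lld, seconds past midnight %.0f\n",
           (long long)day_ord, abstime);      /* prints: 1, 10          */
    return 0;
}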
/* if (ordinal >= 0) { */ + /* } */ + /* else { */ + /* return (double) (unit * mod_compat(ordinal - start_ord, per_day)); */ + /* } */ } /* Sets the time part of the DateTime object. */ @@ -926,7 +964,7 @@ int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, indicate the calendar to be used. */ static int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, - int64_t absdate, + npy_int64 absdate, double abstime, int calendar) { @@ -938,15 +976,10 @@ int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, abstime); /* Calculate the date */ - if (dInfoCalc_SetFromAbsDate(dinfo, - absdate, - calendar)) - goto onError; + if (dInfoCalc_SetFromAbsDate(dinfo, absdate, calendar)) goto onError; /* Calculate the time */ - if (dInfoCalc_SetFromAbsTime(dinfo, - abstime)) - goto onError; + if (dInfoCalc_SetFromAbsTime(dinfo, abstime)) goto onError; return 0; onError: @@ -957,19 +990,19 @@ int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, * New pandas API-helper code, to expose to cython * ------------------------------------------------------------------*/ -int64_t asfreq(int64_t period_ordinal, int freq1, int freq2, char relation) +npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation) { - int64_t val; + npy_int64 val; freq_conv_func func; asfreq_info finfo; - func = get_asfreq_func(freq1, freq2, 0); + func = get_asfreq_func(freq1, freq2); get_asfreq_info(freq1, freq2, &finfo); val = (*func)(period_ordinal, relation, &finfo); if (val == INT_ERR_CODE) { - Py_Error(PyExc_ValueError, "Unable to convert to desired frequency."); + // Py_Error(PyExc_ValueError, "Unable to convert to desired frequency."); goto onError; } return val; @@ -977,27 +1010,28 @@ int64_t asfreq(int64_t period_ordinal, int freq1, int freq2, char relation) return INT_ERR_CODE; } + /* generate an ordinal in period space */ -int64_t get_period_ordinal(int year, int month, int day, +npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, int second, int freq) { - int64_t absdays, delta; - int64_t weeks, days; - int64_t adj_ordinal, ordinal, day_adj; + npy_int64 absdays, delta; + npy_int64 weeks, days; + npy_int64 adj_ordinal, ordinal, day_adj; int freq_group, fmonth, mdiff, quarter; freq_group = get_freq_group(freq); if (freq == FR_SEC) { absdays = absdate_from_ymd(year, month, day); - delta = (absdays - HIGHFREQ_ORIG); - return (int64_t)(delta*86400 + hour*3600 + minute*60 + second + 1); + delta = (absdays - ORD_OFFSET); + return (npy_int64)(delta*86400 + hour*3600 + minute*60 + second); } if (freq == FR_MIN) { absdays = absdate_from_ymd(year, month, day); - delta = (absdays - HIGHFREQ_ORIG); - return (int64_t)(delta*1440 + hour*60 + minute + 1); + delta = (absdays - ORD_OFFSET); + return (npy_int64)(delta*1440 + hour*60 + minute); } if (freq == FR_HR) { @@ -1005,18 +1039,18 @@ int64_t get_period_ordinal(int year, int month, int day, { goto onError; } - delta = (absdays - HIGHFREQ_ORIG); - return (int64_t)(delta*24 + hour + 1); + delta = (absdays - ORD_OFFSET); + return (npy_int64)(delta*24 + hour); } if (freq == FR_DAY) { - return (int64_t)absdate_from_ymd(year, month, day); + return (npy_int64) (absdate_from_ymd(year, month, day) - ORD_OFFSET); } if (freq == FR_UND) { - return (int64_t)absdate_from_ymd(year, month, day); + return (npy_int64) (absdate_from_ymd(year, month, day) - ORD_OFFSET); } if (freq == FR_BUS) @@ -1025,24 +1059,24 @@ int64_t get_period_ordinal(int year, int month, int day, { goto onError; } - weeks = days/7; - return (int64_t)(days - weeks*2); + weeks = 
days / 7; + return (npy_int64)(days - weeks * 2) - BDAY_OFFSET; } if (freq_group == FR_WK) { - if((ordinal = (int64_t)absdate_from_ymd(year, month, day)) == INT_ERR_CODE) + if((ordinal = (npy_int64)absdate_from_ymd(year, month, day)) == INT_ERR_CODE) { goto onError; } day_adj = (7 - (freq - FR_WK)) % 7; adj_ordinal = ordinal + ((7 - day_adj) - ordinal % 7) % 7; - return adj_ordinal/7; + return adj_ordinal / 7 - WEEK_OFFSET; } if (freq == FR_MTH) { - return (year-1)*12 + month; + return (year - BASE_YEAR) * 12 + month - 1; } if (freq_group == FR_QTR) @@ -1054,7 +1088,7 @@ int64_t get_period_ordinal(int year, int month, int day, if (mdiff < 0) mdiff += 12; if (month >= fmonth) mdiff += 12; - return 1 + (year - 1) * 4 + (mdiff - 1) / 3; + return (year - BASE_YEAR) * 4 + (mdiff - 1) / 3; } if (freq_group == FR_ANN) @@ -1062,10 +1096,10 @@ int64_t get_period_ordinal(int year, int month, int day, fmonth = freq - FR_ANN; if (fmonth == 0) fmonth = 12; if (month <= fmonth) { - return year; + return year - BASE_YEAR; } else { - return year + 1; + return year - BASE_YEAR + 1; } } @@ -1082,17 +1116,17 @@ int64_t get_period_ordinal(int year, int month, int day, is calculated for the last day of the period. */ -int64_t get_python_ordinal(int64_t period_ordinal, int freq) +npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) { asfreq_info af_info; - int64_t (*toDaily)(int64_t, char, asfreq_info*); + npy_int64 (*toDaily)(npy_int64, char, asfreq_info*); if (freq == FR_DAY) - return period_ordinal; + return period_ordinal + ORD_OFFSET; - toDaily = get_asfreq_func(freq, FR_DAY, 0); + toDaily = get_asfreq_func(freq, FR_DAY); get_asfreq_info(freq, FR_DAY, &af_info); - return toDaily(period_ordinal, 'E', &af_info); + return toDaily(period_ordinal, 'E', &af_info) + ORD_OFFSET; } char *str_replace(const char *s, const char *old, const char *new) { @@ -1129,7 +1163,7 @@ char *str_replace(const char *s, const char *old, const char *new) { // function to generate a nice string representation of the period // object, originally from DateObject_strftime -char *skts_strftime(int64_t value, int freq, PyObject *args) +char *skts_strftime(npy_int64 ordinal, int freq, PyObject *args) { char *orig_fmt_str, *fmt_str; char *result; @@ -1144,24 +1178,31 @@ char *skts_strftime(int64_t value, int freq, PyObject *args) int extra_fmts_found_one = 0; struct tm c_date; struct date_info tempDate; - int64_t absdate; + npy_int64 absdate, daily_ord; double abstime; int i, result_len; PyObject *py_result; - int64_t (*toDaily)(int64_t, char, asfreq_info*) = NULL; + npy_int64 (*toDaily)(npy_int64, char, asfreq_info*) = NULL; asfreq_info af_info; if (!PyArg_ParseTuple(args, "s:strftime(fmt)", &orig_fmt_str)) return NULL; - toDaily = get_asfreq_func(freq, FR_DAY, 0); + toDaily = get_asfreq_func(freq, FR_DAY); get_asfreq_info(freq, FR_DAY, &af_info); - absdate = toDaily(value, 'E', &af_info); - abstime = getAbsTime(freq, absdate, value); + daily_ord = toDaily(ordinal, 'E', &af_info); + abstime = get_abs_time(freq, daily_ord, ordinal); - if(dInfoCalc_SetFromAbsDateTime(&tempDate, absdate, abstime, + if (abstime < 0) { + abstime += 86400; + daily_ord -= 1; + } + + /* printf("daily_ord: %d, abstime: %f \n", (int) daily_ord, abstime); */ + + if(dInfoCalc_SetFromAbsDateTime(&tempDate, daily_ord + ORD_OFFSET, abstime, GREGORIAN_CALENDAR)) return NULL; // populate standard C date struct with info from our date_info struct @@ -1221,7 +1262,7 @@ char *skts_strftime(int64_t value, int freq, PyObject *args) } else { qtr_freq = FR_QTR; } 
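/* ------------------------------------------------------------------
 * Editor's note (not part of the patch): after the rebase, the
 * calendar-period ordinals produced by get_period_ordinal() above
 * count whole periods elapsed since 1970 (BASE_YEAR): January 1970 is
 * monthly ordinal 0, 1970Q1 is quarterly ordinal 0, and earlier
 * periods go negative.  The sketch below checks that arithmetic for a
 * December year end; month_ordinal and quarter_ordinal are
 * illustrative names only, not functions from period.c.
 * ------------------------------------------------------------------ */
#include <assert.h>
#include <stdio.h>

#define BASE_YEAR 1970                 /* same constant added to period.h */

static long month_ordinal(int year, int month)     /* month: 1..12       */
{ return (long)(year - BASE_YEAR) * 12 + month - 1; }

static long quarter_ordinal(int year, int month)   /* December year end  */
{ return (long)(year - BASE_YEAR) * 4 + (month - 1) / 3; }

int main(void)
{
    assert(month_ordinal(1970, 1)   == 0);    /* Jan 1970                */
    assert(month_ordinal(1969, 12)  == -1);   /* pre-1970 goes negative  */
    assert(quarter_ordinal(1970, 4) == 1);    /* 1970Q2                  */
    printf("May 2012 -> monthly ordinal %ld\n",
           month_ordinal(2012, 5));           /* prints: 508             */
    return 0;
}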
get_asfreq_info(FR_DAY, qtr_freq, &af_info); - if(DtoQ_yq(absdate, &af_info, &year, &quarter) == INT_ERR_CODE) + if(DtoQ_yq(daily_ord, &af_info, &year, &quarter) == INT_ERR_CODE) { return NULL; } if(strcmp(extra_fmts[i][0], "%q") == 0) { @@ -1263,7 +1304,7 @@ char *skts_strftime(int64_t value, int freq, PyObject *args) return result; } -char *period_to_string(int64_t value, int freq) +char *period_to_string(npy_int64 value, int freq) { int freq_group = get_freq_group(freq); PyObject *string_arg; @@ -1275,7 +1316,7 @@ char *period_to_string(int64_t value, int freq) if ((retval = PyArray_malloc(digits * sizeof(char))) == NULL) { return (char *)PyErr_NoMemory(); } - sprintf(retval, "%ld", value); + sprintf(retval, "%ld", (long int) value); return retval; } else if (freq_group == FR_ANN) { string_arg = Py_BuildValue("(s)", "%Y"); } @@ -1296,7 +1337,7 @@ char *period_to_string(int64_t value, int freq) return retval; } -char *period_to_string2(int64_t value, int freq, char *fmt) +char *period_to_string2(npy_int64 value, int freq, char *fmt) { PyObject *string_arg; char *retval; @@ -1307,11 +1348,11 @@ char *period_to_string2(int64_t value, int freq, char *fmt) return retval; } -static int _quarter_year(int64_t ordinal, int freq, int *year, int *quarter) { +static int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { asfreq_info af_info; int qtr_freq; - ordinal = get_python_ordinal(ordinal, freq); + ordinal = get_python_ordinal(ordinal, freq) - ORD_OFFSET; if (get_freq_group(freq) == FR_QTR) qtr_freq = freq; @@ -1355,94 +1396,100 @@ static int _ISOWeek(struct date_info *dinfo) return week; } -int get_date_info(int64_t ordinal, int freq, struct date_info *dinfo) +int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) { - int64_t absdate = get_python_ordinal(ordinal, freq); - double abstime = getAbsTime(freq, absdate, ordinal); + npy_int64 absdate = get_python_ordinal(ordinal, freq); + /* printf("freq: %d, absdate: %d\n", freq, (int) absdate); */ + double abstime = get_abs_time(freq, absdate - ORD_OFFSET, ordinal); + if (abstime < 0) { + abstime += 86400; + absdate -= 1; + } - if(dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime, GREGORIAN_CALENDAR)) + if(dInfoCalc_SetFromAbsDateTime(dinfo, absdate, + abstime, GREGORIAN_CALENDAR)) return INT_ERR_CODE; return 0; } -int pyear(int64_t ordinal, int freq) { +int pyear(npy_int64 ordinal, int freq) { struct date_info dinfo; get_date_info(ordinal, freq, &dinfo); return dinfo.year; } -int pqyear(int64_t ordinal, int freq) { +int pqyear(npy_int64 ordinal, int freq) { int year, quarter; if( _quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) return INT_ERR_CODE; return year; } -int pquarter(int64_t ordinal, int freq) { +int pquarter(npy_int64 ordinal, int freq) { int year, quarter; if(_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) return INT_ERR_CODE; return quarter; } -int pmonth(int64_t ordinal, int freq) { +int pmonth(npy_int64 ordinal, int freq) { struct date_info dinfo; if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.month; } -int pday(int64_t ordinal, int freq) { +int pday(npy_int64 ordinal, int freq) { struct date_info dinfo; if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.day; } -int pweekday(int64_t ordinal, int freq) { +int pweekday(npy_int64 ordinal, int freq) { struct date_info dinfo; if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.day_of_week; } -int 
pday_of_week(int64_t ordinal, int freq) { +int pday_of_week(npy_int64 ordinal, int freq) { struct date_info dinfo; if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.day_of_week; } -int pday_of_year(int64_t ordinal, int freq) { +int pday_of_year(npy_int64 ordinal, int freq) { struct date_info dinfo; if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.day_of_year; } -int pweek(int64_t ordinal, int freq) { +int pweek(npy_int64 ordinal, int freq) { struct date_info dinfo; if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return _ISOWeek(&dinfo); } -int phour(int64_t ordinal, int freq) { +int phour(npy_int64 ordinal, int freq) { struct date_info dinfo; if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.hour; } -int pminute(int64_t ordinal, int freq) { +int pminute(npy_int64 ordinal, int freq) { struct date_info dinfo; if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.minute; } -int psecond(int64_t ordinal, int freq) { +int psecond(npy_int64 ordinal, int freq) { struct date_info dinfo; if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; diff --git a/pandas/src/period.h b/pandas/src/period.h index f1e4f476ec924..1ece756b8fb75 100644 --- a/pandas/src/period.h +++ b/pandas/src/period.h @@ -10,6 +10,7 @@ #include #include "numpy/ndarraytypes.h" #include "stdint.h" +#include "limits.h" /* * declarations from period here @@ -28,13 +29,15 @@ // HIGHFREQ_ORIG is the datetime ordinal from which to begin the second // frequency ordinal sequence -// begins second ordinal at 1/1/1AD gregorian proleptic calendar -#define HIGHFREQ_ORIG 1 - // typedef int64_t npy_int64; - // begins second ordinal at 1/1/1970 unix epoch -// #define HIGHFREQ_ORIG 719163 + +// #define HIGHFREQ_ORIG 62135683200LL +#define BASE_YEAR 1970 +#define ORD_OFFSET 719163LL // days until 1970-01-01 +#define BDAY_OFFSET 513689LL // days until 1970-01-01 +#define WEEK_OFFSET 102737LL +#define HIGHFREQ_ORIG 0 // ORD_OFFSET * 86400LL // days until 1970-01-01 #define FR_ANN 1000 /* Annual */ #define FR_ANNDEC FR_ANN /* Annual - December year end*/ @@ -85,7 +88,7 @@ #define FR_UND -10000 /* Undefined */ -#define INT_ERR_CODE -1 +#define INT_ERR_CODE INT32_MIN #define MEM_CHECK(item) if (item == NULL) { return PyErr_NoMemory(); } #define ERR_CHECK(item) if (item == NULL) { return NULL; } @@ -103,7 +106,7 @@ typedef struct asfreq_info { typedef struct date_info { - int64_t absdate; + npy_int64 absdate; double abstime; double second; @@ -118,40 +121,40 @@ typedef struct date_info { int calendar; } date_info; -typedef int64_t (*freq_conv_func)(int64_t, char, asfreq_info*); +typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info*); /* * new pandas API helper functions here */ -int64_t asfreq(int64_t period_ordinal, int freq1, int freq2, char relation); +npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation); -int64_t get_period_ordinal(int year, int month, int day, +npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, int second, int freq); -int64_t get_python_ordinal(int64_t period_ordinal, int freq); +npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq); -char *skts_strftime(int64_t value, int freq, PyObject *args); -char *period_to_string(int64_t value, int freq); -char *period_to_string2(int64_t value, int freq, char *fmt); +char *skts_strftime(npy_int64 value, 
int freq, PyObject *args); +char *period_to_string(npy_int64 value, int freq); +char *period_to_string2(npy_int64 value, int freq, char *fmt); -int get_date_info(int64_t ordinal, int freq, struct date_info *dinfo); -freq_conv_func get_asfreq_func(int fromFreq, int toFreq, int forConvert); +int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo); +freq_conv_func get_asfreq_func(int fromFreq, int toFreq); void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info); -int pyear(int64_t ordinal, int freq); -int pqyear(int64_t ordinal, int freq); -int pquarter(int64_t ordinal, int freq); -int pmonth(int64_t ordinal, int freq); -int pday(int64_t ordinal, int freq); -int pweekday(int64_t ordinal, int freq); -int pday_of_week(int64_t ordinal, int freq); -int pday_of_year(int64_t ordinal, int freq); -int pweek(int64_t ordinal, int freq); -int phour(int64_t ordinal, int freq); -int pminute(int64_t ordinal, int freq); -int psecond(int64_t ordinal, int freq); -double getAbsTime(int freq, int64_t dailyDate, int64_t originalDate); +int pyear(npy_int64 ordinal, int freq); +int pqyear(npy_int64 ordinal, int freq); +int pquarter(npy_int64 ordinal, int freq); +int pmonth(npy_int64 ordinal, int freq); +int pday(npy_int64 ordinal, int freq); +int pweekday(npy_int64 ordinal, int freq); +int pday_of_week(npy_int64 ordinal, int freq); +int pday_of_year(npy_int64 ordinal, int freq); +int pweek(npy_int64 ordinal, int freq); +int phour(npy_int64 ordinal, int freq); +int pminute(npy_int64 ordinal, int freq); +int psecond(npy_int64 ordinal, int freq); +double getAbsTime(int freq, npy_int64 dailyDate, npy_int64 originalDate); #endif diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index 3aa6388a144e1..49cdddb4b7740 100644 --- a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -9,6 +9,7 @@ cdef class Reducer: cdef: Py_ssize_t increment, chunksize, nresults object arr, dummy, f, labels + bint can_set_name def __init__(self, object arr, object f, axis=1, dummy=None, labels=None): @@ -37,12 +38,14 @@ cdef class Reducer: def _check_dummy(self, dummy=None): if dummy is None: dummy = np.empty(self.chunksize, dtype=self.arr.dtype) + self.can_set_name = 0 else: if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') if len(dummy) != self.chunksize: raise ValueError('Dummy array must be length %d' % self.chunksize) + self.can_set_name = type(dummy) != np.ndarray return dummy @@ -54,7 +57,7 @@ cdef class Reducer: flatiter it object res bint set_label = 0 - ndarray[object] labels + ndarray labels arr = self.arr chunk = self.dummy @@ -62,18 +65,14 @@ cdef class Reducer: dummy_buf = chunk.data chunk.data = arr.data - set_label = self.labels is not None - + set_label = self.labels is not None and self.can_set_name if set_label: - if not np.issubdtype(self.labels.dtype, object): - labels = self.labels.astype('O') - else: - labels = self.labels + labels = self.labels try: for i in range(self.nresults): if set_label: - chunk.name = labels[i] + chunk.name = util.get_value_at(labels, i) res = self.f(chunk) if i == 0: @@ -86,11 +85,14 @@ cdef class Reducer: except Exception, e: if hasattr(e, 'args'): e.args = e.args + (i,) + raise finally: # so we don't free the wrong memory chunk.data = dummy_buf + if result.dtype == np.object_: result = maybe_convert_objects(result) + return result def _get_result_array(self, object res): diff --git a/pandas/src/sandbox.pyx b/pandas/src/sandbox.pyx index c161ca6ad3c98..dabeb7cf3371c 100644 --- a/pandas/src/sandbox.pyx +++ 
b/pandas/src/sandbox.pyx @@ -421,117 +421,6 @@ def int64_unique(ndarray[int64_t] arr): return np.sort(uniques[:j]) -def group_add_bin(ndarray[float64_t, ndim=2] out, - ndarray[int32_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int32_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - ngroups = len(bins) + 1 - N, K = ( values).shape - - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - print i, b, counts, nobs.squeeze() - - for i in range(ngroups): - print 'writing %d' % i - for j in range(K): - if nobs[i] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_add(ndarray[float64_t, ndim=2] out, - ndarray[int32_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int32_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] - - -from datetime cimport getAbsTime - # cdef extern from "kvec.h": @@ -546,12 +435,6 @@ def test_foo(ndarray[int64_t] values): val = values[0] print val -def get_abs_time(freq, dailyDate, originalDate): - return getAbsTime(freq, dailyDate, originalDate) - -have_pytz = 1 -import pytz - # cdef extern from "foo.h": # double add_things(double *a, double *b, double *c, int n) @@ -581,3 +464,37 @@ def inner(ndarray[float64_t] x, ndarray[float64_t] y): for i in range(n): result += x[i] * y[i] return result + +def indices_fast(ndarray[int64_t] labels, list keys, + list sorted_labels): + cdef: + Py_ssize_t i, j, k, lab, cur, start, n = len(labels) + dict result = {} + object tup + + index = np.arange(n) + + k = len(keys) + + if n == 0: + return result + + start = 0 + cur = labels[0] + for i in range(1, n): + lab = labels[i] + + if lab != cur: + if lab != -1: + tup = PyTuple_New(k) + for j in range(k): + val = util.get_value_at(keys[j], + sorted_labels[j][cur]) + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) + + result[tup] = index[start:i] + start = i + cur = lab + + return result diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index 8f8ce424d07ed..8db04bc6396ad 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -1,7 +1,32 @@ cimport numpy as np cimport cython +import numpy as np from numpy cimport * +from numpy cimport NPY_INT32 as NPY_int32 +from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT32 as NPY_float32 +from numpy 
cimport NPY_FLOAT64 as NPY_float64 + +int32 = np.dtype(np.int32) +int64 = np.dtype(np.int64) +float32 = np.dtype(np.float32) +float64 = np.dtype(np.float64) + +cdef np.int32_t MINint32 = np.iinfo(np.int32).min +cdef np.int64_t MINint64 = np.iinfo(np.int64).min +cdef np.float32_t MINfloat32 = np.NINF +cdef np.float64_t MINfloat64 = np.NINF + +cdef np.int32_t MAXint32 = np.iinfo(np.int32).max +cdef np.int64_t MAXint64 = np.iinfo(np.int64).max +cdef np.float32_t MAXfloat32 = np.inf +cdef np.float64_t MAXfloat64 = np.inf + + +cdef extern from "numpy/arrayobject.h": + cdef enum NPY_TYPES: + NPY_intp "NPY_INTP" from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, PyDict_Contains, PyDict_Keys, @@ -11,10 +36,10 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, from cpython cimport PyFloat_Check cimport cpython -import numpy as np isnan = np.isnan cdef double NaN = np.NaN cdef double nan = NaN +cdef double NAN = nan from datetime import datetime as pydatetime @@ -156,7 +181,7 @@ cdef double INF = np.inf cdef double NEGINF = -INF cpdef checknull(object val): - if util.is_float_object(val): + if util.is_float_object(val) or util.is_complex_object(val): return val != val or val == INF or val == NEGINF elif util.is_datetime64_object(val): return val.view('i8') == NaT @@ -379,7 +404,6 @@ def fast_zip(list ndarrays): return result - def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): cdef: Py_ssize_t i, n = len(indexer) @@ -671,7 +695,6 @@ include "skiplist.pyx" include "groupby.pyx" include "moments.pyx" include "reindex.pyx" -include "generated.pyx" include "reduce.pyx" include "stats.pyx" include "properties.pyx" diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/src/ujson/lib/ultrajson.h new file mode 100644 index 0000000000000..0514236e750e1 --- /dev/null +++ b/pandas/src/ujson/lib/ultrajson.h @@ -0,0 +1,301 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +*/ + +/* +Ultra fast JSON encoder and decoder +Developed by Jonas Tarnstrom (jonas@esn.me). + +Encoder notes: +------------------ + +:: Cyclic references :: +Cyclic referenced objects are not detected. +Set JSONObjectEncoder.recursionMax to suitable value or make sure input object +tree doesn't have cyclic references. + +*/ + +#ifndef __ULTRAJSON_H__ +#define __ULTRAJSON_H__ + +#include +#include + +//#define JSON_DECODE_NUMERIC_AS_DOUBLE + +// Don't output any extra whitespaces when encoding +#define JSON_NO_EXTRA_WHITESPACE + +// Max decimals to encode double floating point numbers with +#ifndef JSON_DOUBLE_MAX_DECIMALS +#define JSON_DOUBLE_MAX_DECIMALS 15 +#endif + +// Max recursion depth, default for encoder +#ifndef JSON_MAX_RECURSION_DEPTH +#define JSON_MAX_RECURSION_DEPTH 1024 +#endif + +/* +Dictates and limits how much stack space for buffers UltraJSON will use before resorting to provided heap functions */ +#ifndef JSON_MAX_STACK_BUFFER_SIZE +#define JSON_MAX_STACK_BUFFER_SIZE 131072 +#endif + +#ifdef _WIN32 + +typedef __int64 JSINT64; +typedef unsigned __int64 JSUINT64; + +typedef unsigned __int32 uint32_t; +typedef __int32 JSINT32; +typedef uint32_t JSUINT32; +typedef unsigned __int8 JSUINT8; +typedef unsigned __int16 JSUTF16; +typedef unsigned __int32 JSUTF32; +typedef __int64 JSLONG; + +#define EXPORTFUNCTION __declspec(dllexport) + +#define FASTCALL_MSVC __fastcall +#define FASTCALL_ATTR +#define INLINE_PREFIX __inline + +#else + +#include +typedef int64_t JSINT64; +typedef u_int64_t JSUINT64; + +typedef int32_t JSINT32; +typedef u_int32_t JSUINT32; + +#define FASTCALL_MSVC +#define FASTCALL_ATTR __attribute__((fastcall)) +#define INLINE_PREFIX inline + +typedef u_int32_t uint32_t; + +typedef u_int8_t JSUINT8; +typedef u_int16_t JSUTF16; +typedef u_int32_t JSUTF32; + +typedef int64_t JSLONG; + +#define EXPORTFUNCTION +#endif + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define __LITTLE_ENDIAN__ +#else + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define __BIG_ENDIAN__ +#endif + +#endif + +#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) +#error "Endianess not supported" +#endif + +enum JSTYPES +{ + JT_NULL, // NULL + JT_TRUE, //boolean true + JT_FALSE, //boolean false + JT_INT, //(JSINT32 (signed 32-bit)) + JT_LONG, //(JSINT64 (signed 64-bit)) + JT_DOUBLE, //(double) + JT_UTF8, //(char 8-bit) + JT_ARRAY, // Array structure + JT_OBJECT, // Key/Value structure + JT_INVALID, // Internal, do not return nor expect +}; + +typedef void * JSOBJ; +typedef void * JSITER; + +typedef struct __JSONTypeContext +{ + int type; + void *encoder; + void *prv[32]; +} JSONTypeContext; + +/* +Function pointer declarations, suitable for implementing UltraJSON */ +typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc); +typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, 
JSONTypeContext *tc); +typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc); +typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc); +typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, size_t *outLen); +typedef void *(*JSPFN_MALLOC)(size_t size); +typedef void (*JSPFN_FREE)(void *pptr); +typedef void *(*JSPFN_REALLOC)(void *base, size_t size); + +typedef struct __JSONObjectEncoder +{ + void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc); + void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc); + const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen); + JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); + JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); + double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); + + /* + Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT) + Implementor should setup iteration state in ti->prv + */ + JSPFN_ITERBEGIN iterBegin; + + /* + Retrieve next object in an iteration. Should return 0 to indicate iteration has reached end or 1 if there are more items. + Implementor is responsible for keeping state of the iteration. Use ti->prv fields for this + */ + JSPFN_ITERNEXT iterNext; + + /* + Ends the iteration of an iteratable object. + Any iteration state stored in ti->prv can be freed here + */ + JSPFN_ITEREND iterEnd; + + /* + Returns a reference to the value object of an iterator + The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + */ + JSPFN_ITERGETVALUE iterGetValue; + + /* + Return name of iterator. + The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + */ + JSPFN_ITERGETNAME iterGetName; + + /* + Release a value as indicated by setting ti->release = 1 in the previous getValue call. + The ti->prv array should contain the necessary context to release the value + */ + void (*releaseObject)(JSOBJ obj); + + /* Library functions + Set to NULL to use STDLIB malloc,realloc,free */ + JSPFN_MALLOC malloc; + JSPFN_REALLOC realloc; + JSPFN_FREE free; + + /* + Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/ + int recursionMax; + + /* + Configuration for max decimals of double floating poiunt numbers to encode (0-9) */ + int doublePrecision; + + /* + If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or what ever charset strings are brought as */ + int forceASCII; + + + /* + Set to an error message if error occured */ + const char *errorMsg; + JSOBJ errorObj; + + /* Buffer stuff */ + char *start; + char *offset; + char *end; + int heap; + int level; + +} JSONObjectEncoder; + + +/* +Encode an object structure into JSON. + +Arguments: +obj - An anonymous type representing the object +enc - Function definitions for querying JSOBJ type +buffer - Preallocated buffer to store result in. If NULL function allocates own buffer +cbBuffer - Length of buffer (ignored if buffer is NULL) + +Returns: +Encoded JSON object as a null terminated char string. + +NOTE: +If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer. +Life cycle of the provided buffer must still be handled by caller. + +If the return value doesn't equal the specified buffer caller must release the memory using +JSONObjectEncoder.free or free() as specified when calling this function. 
+*/ +EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer); + + + +typedef struct __JSONObjectDecoder +{ + JSOBJ (*newString)(wchar_t *start, wchar_t *end); + int (*objectAddKey)(JSOBJ obj, JSOBJ name, JSOBJ value); + int (*arrayAddItem)(JSOBJ obj, JSOBJ value); + JSOBJ (*newTrue)(void); + JSOBJ (*newFalse)(void); + JSOBJ (*newNull)(void); + JSOBJ (*newObject)(void *decoder); + JSOBJ (*endObject)(JSOBJ obj); + JSOBJ (*newArray)(void *decoder); + JSOBJ (*endArray)(JSOBJ obj); + JSOBJ (*newInt)(JSINT32 value); + JSOBJ (*newLong)(JSINT64 value); + JSOBJ (*newDouble)(double value); + void (*releaseObject)(JSOBJ obj, void *decoder); + JSPFN_MALLOC malloc; + JSPFN_FREE free; + JSPFN_REALLOC realloc; + + char *errorStr; + char *errorOffset; + + + +} JSONObjectDecoder; + +EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); + +#endif diff --git a/pandas/src/ujson/lib/ultrajsondec.c b/pandas/src/ujson/lib/ultrajsondec.c new file mode 100644 index 0000000000000..591122be82f92 --- /dev/null +++ b/pandas/src/ujson/lib/ultrajsondec.c @@ -0,0 +1,837 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. 
+ +*/ + +#include "ultrajson.h" +#include +#include +#include +#include +#include + +struct DecoderState +{ + char *start; + char *end; + wchar_t *escStart; + wchar_t *escEnd; + int escHeap; + int lastType; + JSONObjectDecoder *dec; +}; + +JSOBJ FASTCALL_MSVC decode_any( struct DecoderState *ds) FASTCALL_ATTR; +typedef JSOBJ (*PFN_DECODER)( struct DecoderState *ds); +#define RETURN_JSOBJ_NULLCHECK(_expr) return(_expr); + +double createDouble(double intNeg, double intValue, double frcValue, int frcDecimalCount) +{ + static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, 1000000000000000}; + + return (intValue + (frcValue / g_pow10[frcDecimalCount])) * intNeg; +} + +static JSOBJ SetError( struct DecoderState *ds, int offset, const char *message) +{ + ds->dec->errorOffset = ds->start + offset; + ds->dec->errorStr = (char *) message; + return NULL; +} + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric ( struct DecoderState *ds) +{ +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE + double intNeg = 1; + double intValue; +#else + int intNeg = 1; + JSLONG intValue; +#endif + + double expNeg; + int chr; + int decimalCount = 0; + double frcValue = 0.0; + double expValue; + char *offset = ds->start; + + if (*(offset) == '-') + { + offset ++; + intNeg = -1; + } + + // Scan integer part + intValue = 0; + + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + //FIXME: Check for arithemtic overflow here + //PERF: Don't do 64-bit arithmetic here unless we know we have to +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE + intValue = intValue * 10.0 + (double) (chr - 48); +#else + intValue = intValue * 10LL + (JSLONG) (chr - 48); +#endif + offset ++; + break; + + case '.': + offset ++; + goto DECODE_FRACTION; + break; + + case 'e': + case 'E': + offset ++; + goto DECODE_EXPONENT; + break; + + default: + goto BREAK_INT_LOOP; + break; + } + } + +BREAK_INT_LOOP: + + ds->lastType = JT_INT; + ds->start = offset; + + //If input string is LONGLONG_MIN here the value is already negative so we should not flip it + +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE +#else + if (intValue < 0) + { + intNeg = 1; + } +#endif + + //dbg1 = (intValue * intNeg); + //dbg2 = (JSLONG) dbg1; + +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE + if (intValue > (double) INT_MAX || intValue < (double) INT_MIN) +#else + if ( (intValue >> 31)) +#endif + { + RETURN_JSOBJ_NULLCHECK(ds->dec->newLong( (JSINT64) (intValue * (JSINT64) intNeg))); + } + else + { + RETURN_JSOBJ_NULLCHECK(ds->dec->newInt( (JSINT32) (intValue * intNeg))); + } + + + +DECODE_FRACTION: + + // Scan fraction part + frcValue = 0.0; + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) + { + frcValue = frcValue * 10.0 + (double) (chr - 48); + decimalCount ++; + } + offset ++; + break; + + case 'e': + case 'E': + offset ++; + goto DECODE_EXPONENT; + break; + + default: + goto BREAK_FRC_LOOP; + } + } + +BREAK_FRC_LOOP: + + if (intValue < 0) + { + intNeg = 1; + } + + //FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newDouble (createDouble( (double) intNeg, (double) intValue, 
frcValue, decimalCount))); + +DECODE_EXPONENT: + expNeg = 1.0; + + if (*(offset) == '-') + { + expNeg = -1.0; + offset ++; + } + else + if (*(offset) == '+') + { + expNeg = +1.0; + offset ++; + } + + expValue = 0.0; + + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + expValue = expValue * 10.0 + (double) (chr - 48); + offset ++; + break; + + default: + goto BREAK_EXP_LOOP; + + } + } + +BREAK_EXP_LOOP: + +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE +#else + if (intValue < 0) + { + intNeg = 1; + } +#endif + + //FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newDouble (createDouble( (double) intNeg, (double) intValue , frcValue, decimalCount) * pow(10.0, expValue * expNeg))); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_true ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'r') + goto SETERROR; + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_TRUE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newTrue()); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'true'"); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_false ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'a') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 's') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_FALSE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newFalse()); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'false'"); + +} + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_null ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newNull()); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'null'"); +} + +FASTCALL_ATTR void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) +{ + char *offset = ds->start; + + while (1) + { + switch (*offset) + { + case ' ': + case '\t': + case '\r': + case '\n': + offset ++; + break; + + default: + ds->start = offset; + return; + } + } +} + + +enum DECODESTRINGSTATE +{ + DS_ISNULL = 0x32, + DS_ISQUOTE, + DS_ISESCAPE, + DS_UTFLENERROR, + +}; + +static const JSUINT8 g_decoderLookup[256] = +{ +/* 0x00 */ DS_ISNULL, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x20 */ 1, 1, DS_ISQUOTE, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, DS_ISESCAPE, 1, 1, 1, +/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, +/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, +}; + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) +{ + JSUTF16 sur[2] = { 0 }; + int iSur = 0; + int index; + wchar_t *escOffset; + size_t escLen = (ds->escEnd - ds->escStart); + JSUINT8 *inputOffset; + JSUINT8 oct; + JSUTF32 ucs; + ds->lastType = JT_INVALID; + ds->start ++; + + if ( (ds->end - ds->start) > escLen) + { + size_t newSize = (ds->end - ds->start); + + if (ds->escHeap) + { + ds->escStart = (wchar_t *) ds->dec->realloc (ds->escStart, newSize * sizeof(wchar_t)); + } + else + { + wchar_t *oldStart = ds->escStart; + ds->escHeap = 1; + ds->escStart = (wchar_t *) ds->dec->malloc (newSize * sizeof(wchar_t)); + memcpy (ds->escStart, oldStart, escLen * sizeof(wchar_t)); + } + + ds->escEnd = ds->escStart + newSize; + } + + escOffset = ds->escStart; + inputOffset = ds->start; + + while(1) + { + switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) + { + case DS_ISNULL: + return SetError(ds, -1, "Unmatched ''\"' when when decoding 'string'"); + + case DS_ISQUOTE: + ds->lastType = JT_UTF8; + inputOffset ++; + ds->start += ( (char *) inputOffset - (ds->start)); + RETURN_JSOBJ_NULLCHECK(ds->dec->newString(ds->escStart, escOffset)); + + case DS_UTFLENERROR: + return SetError (ds, -1, "Invalid UTF-8 sequence length when decoding 'string'"); + + case DS_ISESCAPE: + inputOffset ++; + switch (*inputOffset) + { + case '\\': *(escOffset++) = L'\\'; inputOffset++; continue; + case '\"': *(escOffset++) = L'\"'; inputOffset++; continue; + case '/': *(escOffset++) = L'/'; inputOffset++; continue; + case 'b': *(escOffset++) = L'\b'; inputOffset++; continue; + case 'f': *(escOffset++) = L'\f'; inputOffset++; continue; + case 'n': *(escOffset++) = L'\n'; inputOffset++; continue; + case 'r': *(escOffset++) = L'\r'; inputOffset++; continue; + case 't': *(escOffset++) = L'\t'; inputOffset++; continue; + + case 'u': + { + int index; + inputOffset ++; + + for (index = 0; index < 4; index ++) + { + switch (*inputOffset) + { + case '\0': return SetError (ds, -1, "Unterminated unicode escape sequence when decoding 'string'"); + default: return SetError (ds, -1, "Unexpected character in unicode escape sequence when decoding 'string'"); + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0'); + break; + + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a'); + break; + + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A'); + break; + } + + inputOffset ++; + } + + + if (iSur == 0) + { + if((sur[iSur] & 0xfc00) == 0xd800) + { + // First of a surrogate pair, continue parsing + iSur ++; + break; + } + (*escOffset++) = (wchar_t) sur[iSur]; + iSur = 0; + } + else + { + // Decode pair + if ((sur[1] & 0xfc00) != 0xdc00) + { + return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'"); + } + +#if WCHAR_MAX == 0xffff + (*escOffset++) = (wchar_t) sur[0]; + (*escOffset++) = (wchar_t) sur[1]; +#else + (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); +#endif + iSur = 0; + } + break; + } + + case '\0': return 
SetError(ds, -1, "Unterminated escape sequence when decoding 'string'"); + default: return SetError(ds, -1, "Unrecognized escape sequence when decoding 'string'"); + } + break; + + case 1: + *(escOffset++) = (wchar_t) (*inputOffset++); + break; + + case 2: + { + ucs = (*inputOffset++) & 0x1f; + ucs <<= 6; + if (((*inputOffset) & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + ucs |= (*inputOffset++) & 0x3f; + if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'"); + *(escOffset++) = (wchar_t) ucs; + break; + } + + case 3: + { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x0f; + + for (index = 0; index < 2; index ++) + { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string"); + *(escOffset++) = (wchar_t) ucs; + break; + } + + case 4: + { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x07; + + for (index = 0; index < 3; index ++) + { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'"); + + #if WCHAR_MAX == 0xffff + if (ucs >= 0x10000) + { + ucs -= 0x10000; + *(escOffset++) = (ucs >> 10) + 0xd800; + *(escOffset++) = (ucs & 0x3ff) + 0xdc00; + } + else + { + *(escOffset++) = (wchar_t) ucs; + } + #else + *(escOffset++) = (wchar_t) ucs; + #endif + break; + } + } + } +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_array( struct DecoderState *ds) +{ + JSOBJ itemValue; + JSOBJ newObj = ds->dec->newArray(ds->dec); + + ds->lastType = JT_INVALID; + ds->start ++; + + while (1)//(*ds->start) != '\0') + { + SkipWhitespace(ds); + + if ((*ds->start) == ']') + { + ds->start++; + return ds->dec->endArray(newObj); + } + + itemValue = decode_any(ds); + + if (itemValue == NULL) + { + ds->dec->releaseObject(newObj, ds->dec); + return NULL; + } + + if (!ds->dec->arrayAddItem (newObj, itemValue)) + { + ds->dec->releaseObject(newObj, ds->dec); + return NULL; + } + + SkipWhitespace(ds); + + switch (*(ds->start++)) + { + case ']': + return ds->dec->endArray(newObj); + + case ',': + break; + + default: + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unexpected character in found when decoding array value"); + } + } + + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unmatched ']' when decoding 'array'"); +} + + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_object( struct DecoderState *ds) +{ + JSOBJ itemName; + JSOBJ itemValue; + JSOBJ newObj = ds->dec->newObject(ds->dec); + + ds->start ++; + + while (1) + { + SkipWhitespace(ds); + + if ((*ds->start) == '}') + { + ds->start ++; + return ds->dec->endObject(newObj); + } + + ds->lastType = JT_INVALID; + itemName = decode_any(ds); + + if (itemName == NULL) + { + ds->dec->releaseObject(newObj, ds->dec); + return NULL; + } + + if (ds->lastType != JT_UTF8) + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + return SetError(ds, -1, "Key name of object must be 'string' when decoding 'object'"); + } + + SkipWhitespace(ds); + + if (*(ds->start++) != ':') + { + ds->dec->releaseObject(newObj, ds->dec); 
+ ds->dec->releaseObject(itemName, ds->dec); + return SetError(ds, -1, "No ':' found when decoding object value"); + } + + SkipWhitespace(ds); + + itemValue = decode_any(ds); + + if (itemValue == NULL) + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + return NULL; + } + + if (!ds->dec->objectAddKey (newObj, itemName, itemValue)) + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + ds->dec->releaseObject(itemValue, ds->dec); + return NULL; + } + + SkipWhitespace(ds); + + switch (*(ds->start++)) + { + case '}': + return ds->dec->endObject(newObj); + + case ',': + break; + + default: + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unexpected character in found when decoding object value"); + } + } + + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unmatched '}' when decoding object"); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) +{ + while (1) + { + switch (*ds->start) + { + case '\"': + return decode_string (ds); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return decode_numeric (ds); + + case '[': return decode_array (ds); + case '{': return decode_object (ds); + case 't': return decode_true (ds); + case 'f': return decode_false (ds); + case 'n': return decode_null (ds); + + case ' ': + case '\t': + case '\r': + case '\n': + // White space + ds->start ++; + break; + + default: + return SetError(ds, -1, "Expected object or value"); + } + } +} + + +JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer) +{ + + /* + FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode escaping doesn't run into the wall each time */ + struct DecoderState ds; + wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; + JSOBJ ret; + + ds.start = (char *) buffer; + ds.end = ds.start + cbBuffer; + + ds.escStart = escBuffer; + ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); + ds.escHeap = 0; + ds.dec = dec; + ds.dec->errorStr = NULL; + ds.dec->errorOffset = NULL; + + ds.dec = dec; + + ret = decode_any (&ds); + + if (ds.escHeap) + { + dec->free(ds.escStart); + } + return ret; +} diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/src/ujson/lib/ultrajsonenc.c new file mode 100644 index 0000000000000..594bef253b2f6 --- /dev/null +++ b/pandas/src/ujson/lib/ultrajsonenc.c @@ -0,0 +1,858 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +*/ + +#include "ultrajson.h" +#include +#include +#include +#include +#include + +#include + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, 1000000000000000}; +static const char g_hexChars[] = "0123456789abcdef"; +static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/"; + + +/* +FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands. +Needs a cleanup and more documentation */ + +/* +Table for pure ascii output escaping all characters above 127 to \uXXXX */ +static const JSUINT8 g_asciiOutputTable[256] = +{ +/* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30, +/* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, +/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, +/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1, +/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 +}; + + +static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message) +{ + enc->errorMsg = message; + enc->errorObj = obj; +} + +/* +FIXME: Keep track of how big these get across several encoder calls and try to make an estimate +That way we won't run our head into the wall each call */ +void Buffer_Realloc (JSONObjectEncoder *enc, size_t cbNeeded) +{ + size_t curSize = enc->end - enc->start; + size_t newSize = curSize * 2; + size_t offset = enc->offset - enc->start; + + while (newSize < curSize + cbNeeded) + { + newSize *= 2; + } + + if (enc->heap) + { + enc->start = (char *) enc->realloc (enc->start, newSize); + } + else + { + char *oldStart = enc->start; + enc->heap = 1; + enc->start = (char *) enc->malloc (newSize); + memcpy (enc->start, oldStart, offset); + } + enc->offset 
= enc->start + offset; + enc->end = enc->start + newSize; +} + +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked (char *outputOffset, unsigned short value) +{ + *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; + *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; + *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; + *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; +} + +int Buffer_EscapeStringUnvalidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) +{ + char *of = (char *) enc->offset; + + while (1) + { + switch (*io) + { + case 0x00: + if (io < end) + { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + break; + } + else + { + enc->offset += (of - enc->offset); + return TRUE; + } + + case '\"': (*of++) = '\\'; (*of++) = '\"'; break; + case '\\': (*of++) = '\\'; (*of++) = '\\'; break; + case '/': (*of++) = '\\'; (*of++) = '/'; break; + case '\b': (*of++) = '\\'; (*of++) = 'b'; break; + case '\f': (*of++) = '\\'; (*of++) = 'f'; break; + case '\n': (*of++) = '\\'; (*of++) = 'n'; break; + case '\r': (*of++) = '\\'; (*of++) = 'r'; break; + case '\t': (*of++) = '\\'; (*of++) = 't'; break; + + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x0b: + case 0x0e: + case 0x0f: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; + break; + + default: (*of++) = (*io); break; + } + + io++; + } + + return FALSE; +} + + +/* +FIXME: +This code only works with Little and Big Endian + +FIXME: The JSON spec says escape "/" but non of the others do and we don't +want to be left alone doing it so we don't :) + +*/ +int Buffer_EscapeStringValidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) +{ + JSUTF32 ucs; + char *of = (char *) enc->offset; + + while (1) + { + + //JSUINT8 chr = (unsigned char) *io; + JSUINT8 utflen = g_asciiOutputTable[(unsigned char) *io]; + + switch (utflen) + { + case 0: + { + if (io < end) + { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + io ++; + continue; + } + else + { + enc->offset += (of - enc->offset); + return TRUE; + } + } + + case 1: + { + *(of++)= (*io++); + continue; + } + + case 2: + { + JSUTF32 in; + + if (io + 1 > end) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + in = *((JSUTF16 *) io); + +#ifdef __LITTLE_ENDIAN__ + ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); +#else + ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); +#endif + + if (ucs < 0x80) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 2 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 2; + break; + } + + case 3: + { + JSUTF32 in; + + if (io + 2 > end) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + +#ifdef __LITTLE_ENDIAN__ + in = *((JSUTF16 *) io); + in |= *((JSUINT8 *) io + 2) << 16; + ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | ((in & 0x3f0000) >> 16); +#else + in = *((JSUTF16 *) 
io) << 8; + in |= *((JSUINT8 *) io + 2); + ucs = ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); +#endif + + + if (ucs < 0x800) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 3 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 3; + break; + } + case 4: + { + JSUTF32 in; + + if (io + 3 > end) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + +#ifdef __LITTLE_ENDIAN__ + in = *((JSUTF32 *) io); + ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); +#else + in = *((JSUTF32 *) io); + ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); +#endif + if (ucs < 0x10000) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 4 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 4; + break; + } + + + case 5: + case 6: + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unsupported UTF-8 sequence length when encoding string"); + return FALSE; + + case 30: + // \uXXXX encode + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; + io ++; + continue; + + case 10: + case 12: + case 14: + case 16: + case 18: + case 20: + case 22: + case 24: + *(of++) = *( (char *) (g_escapeChars + utflen + 0)); + *(of++) = *( (char *) (g_escapeChars + utflen + 1)); + io ++; + continue; + } + + /* + If the character is a UTF8 sequence of length > 1 we end up here */ + if (ucs >= 0x10000) + { + ucs -= 0x10000; + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (ucs >> 10) + 0xd800); + of += 4; + + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (ucs & 0x3ff) + 0xdc00); + of += 4; + } + else + { + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, ucs); + of += 4; + } + } + + return FALSE; +} + +#define Buffer_Reserve(__enc, __len) \ + if ((__enc)->offset + (__len) > (__enc)->end) \ + { \ + Buffer_Realloc((__enc), (__len));\ + } \ + + +#define Buffer_AppendCharUnchecked(__enc, __chr) \ + *((__enc)->offset++) = __chr; \ + +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char* begin, char* end) +{ + char aux; + while (end > begin) + aux = *end, *end-- = *begin, *begin++ = aux; +} + +void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) +{ + char* wstr; + JSUINT32 uvalue = (value < 0) ? -value : value; + + wstr = enc->offset; + // Conversion. Number is reversed. + + do *wstr++ = (char)(48 + (uvalue % 10)); while(uvalue /= 10); + if (value < 0) *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset,wstr - 1); + enc->offset += (wstr - (enc->offset)); +} + +void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) +{ + char* wstr; + JSUINT64 uvalue = (value < 0) ? -value : value; + + wstr = enc->offset; + // Conversion. Number is reversed. 
+ + do *wstr++ = (char)(48 + (uvalue % 10ULL)); while(uvalue /= 10ULL); + if (value < 0) *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset,wstr - 1); + enc->offset += (wstr - (enc->offset)); +} + +int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc, double value) +{ + /* if input is larger than thres_max, revert to exponential */ + const double thres_max = (double) 1e16 - 1; + int count; + double diff = 0.0; + char* str = enc->offset; + char* wstr = str; + unsigned long whole; + double tmp; + unsigned long frac; + int neg; + double pow10; + + if (value == HUGE_VAL || value == -HUGE_VAL) + { + SetError (obj, enc, "Invalid Inf value when encoding double"); + return FALSE; + } + if (! (value == value)) + { + SetError (obj, enc, "Invalid Nan value when encoding double"); + return FALSE; + } + + + /* we'll work in positive values and deal with the + negative sign issue later */ + neg = 0; + if (value < 0) + { + neg = 1; + value = -value; + } + + pow10 = g_pow10[enc->doublePrecision]; + + whole = (unsigned long) value; + tmp = (value - whole) * pow10; + frac = (unsigned long)(tmp); + diff = tmp - frac; + + if (diff > 0.5) + { + ++frac; + /* handle rollover, e.g. case 0.99 with prec 1 is 1.0 */ + if (frac >= pow10) + { + frac = 0; + ++whole; + } + } + else + if (diff == 0.5 && ((frac == 0) || (frac & 1))) + { + /* if halfway, round up if odd, OR + if last digit is 0. That last part is strange */ + ++frac; + } + + /* for very large numbers switch back to native sprintf for exponentials. + anyone want to write code to replace this? */ + /* + normal printf behavior is to print EVERY whole number digit + which can be 100s of characters overflowing your buffers == bad + */ + if (value > thres_max) + { + enc->offset += sprintf(str, "%.15e", neg ? -value : value); + return TRUE; + } + + if (enc->doublePrecision == 0) + { + diff = value - whole; + + if (diff > 0.5) + { + /* greater than 0.5, round up, e.g. 1.6 -> 2 */ + ++whole; + } + else + if (diff == 0.5 && (whole & 1)) + { + /* exactly 0.5 and ODD, then round up */ + /* 1.5 -> 2, but 2.5 -> 2 */ + ++whole; + } + + //vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 + } + else + if (frac) + { + count = enc->doublePrecision; + // now do fractional part, as an unsigned number + // we know it is not 0 but we can have leading zeros, these + // should be removed + while (!(frac % 10)) + { + --count; + frac /= 10; + } + //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 + + // now do fractional part, as an unsigned number + do + { + --count; + *wstr++ = (char)(48 + (frac % 10)); + } while (frac /= 10); + // add extra 0s + while (count-- > 0) + { + *wstr++ = '0'; + } + // add decimal + *wstr++ = '.'; + } + else + { + *wstr++ = '0'; + *wstr++ = '.'; + } + + // do whole part + // Take care of sign + // Conversion. Number is reversed. 
+ do *wstr++ = (char)(48 + (whole % 10)); while (whole /= 10); + + if (neg) + { + *wstr++ = '-'; + } + strreverse(str, wstr-1); + enc->offset += (wstr - (enc->offset)); + + return TRUE; +} + + + + + + +/* +FIXME: +Handle integration functions returning NULL here */ + +/* +FIXME: +Perhaps implement recursion detection */ + +void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) +{ + JSONTypeContext tc; + tc.encoder = enc; + size_t szlen; + + if (enc->level > enc->recursionMax) + { + SetError (obj, enc, "Maximum recursion level reached"); + return; + } + + /* + This reservation must hold + + length of _name as encoded worst case + + maxLength of double to string OR maxLength of JSLONG to string + + Since input is assumed to be UTF-8 the worst character length is: + + 4 bytes (of UTF-8) => "\uXXXX\uXXXX" (12 bytes) + */ + + Buffer_Reserve(enc, 256 + (((cbName / 4) + 1) * 12)); + + if (name) + { + Buffer_AppendCharUnchecked(enc, '\"'); + + if (enc->forceASCII) + { + if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) + { + return; + } + } + else + { + if (!Buffer_EscapeStringUnvalidated(obj, enc, name, name + cbName)) + { + return; + } + } + + + Buffer_AppendCharUnchecked(enc, '\"'); + + Buffer_AppendCharUnchecked (enc, ':'); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (enc, ' '); +#endif + } + + enc->beginTypeContext(obj, &tc); + + switch (tc.type) + { + case JT_INVALID: + return; + + case JT_ARRAY: + { + int count = 0; + JSOBJ iterObj; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked (enc, '['); + + while (enc->iterNext(obj, &tc)) + { + if (count > 0) + { + Buffer_AppendCharUnchecked (enc, ','); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (buffer, ' '); +#endif + } + + iterObj = enc->iterGetValue(obj, &tc); + + enc->level ++; + encode (iterObj, enc, NULL, 0); + count ++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked (enc, ']'); + break; + } + + case JT_OBJECT: + { + int count = 0; + JSOBJ iterObj; + char *objName; + + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked (enc, '{'); + + while (enc->iterNext(obj, &tc)) + { + if (count > 0) + { + Buffer_AppendCharUnchecked (enc, ','); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (enc, ' '); +#endif + } + + iterObj = enc->iterGetValue(obj, &tc); + objName = enc->iterGetName(obj, &tc, &szlen); + + enc->level ++; + encode (iterObj, enc, objName, szlen); + count ++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked (enc, '}'); + break; + } + + case JT_LONG: + { + Buffer_AppendLongUnchecked (enc, enc->getLongValue(obj, &tc)); + break; + } + + case JT_INT: + { + Buffer_AppendIntUnchecked (enc, enc->getIntValue(obj, &tc)); + break; + } + + case JT_TRUE: + { + Buffer_AppendCharUnchecked (enc, 't'); + Buffer_AppendCharUnchecked (enc, 'r'); + Buffer_AppendCharUnchecked (enc, 'u'); + Buffer_AppendCharUnchecked (enc, 'e'); + break; + } + + case JT_FALSE: + { + Buffer_AppendCharUnchecked (enc, 'f'); + Buffer_AppendCharUnchecked (enc, 'a'); + Buffer_AppendCharUnchecked (enc, 'l'); + Buffer_AppendCharUnchecked (enc, 's'); + Buffer_AppendCharUnchecked (enc, 'e'); + break; + } + + + case JT_NULL: + { + Buffer_AppendCharUnchecked (enc, 'n'); + Buffer_AppendCharUnchecked (enc, 'u'); + Buffer_AppendCharUnchecked (enc, 'l'); + Buffer_AppendCharUnchecked (enc, 'l'); + break; + } + + case JT_DOUBLE: + { + if (!Buffer_AppendDoubleUnchecked (obj, enc, enc->getDoubleValue(obj, &tc))) + { + enc->endTypeContext(obj, &tc); + enc->level --; 
+ return; + } + break; + } + + case JT_UTF8: + { + const char *value = enc->getStringValue(obj, &tc, &szlen); + Buffer_Reserve(enc, ((szlen / 4) + 1) * 12); + Buffer_AppendCharUnchecked (enc, '\"'); + + + if (enc->forceASCII) + { + if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) + { + enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + } + else + { + if (!Buffer_EscapeStringUnvalidated(obj, enc, value, value + szlen)) + { + enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + } + + Buffer_AppendCharUnchecked (enc, '\"'); + break; + } + } + + enc->endTypeContext(obj, &tc); + enc->level --; + +} + +char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer) +{ + enc->malloc = enc->malloc ? enc->malloc : malloc; + enc->free = enc->free ? enc->free : free; + enc->realloc = enc->realloc ? enc->realloc : realloc; + enc->errorMsg = NULL; + enc->errorObj = NULL; + enc->level = 0; + + if (enc->recursionMax < 1) + { + enc->recursionMax = JSON_MAX_RECURSION_DEPTH; + } + + if (enc->doublePrecision < 0 || + enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) + { + enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; + } + + if (_buffer == NULL) + { + _cbBuffer = 32768; + enc->start = (char *) enc->malloc (_cbBuffer); + enc->heap = 1; + } + else + { + enc->start = _buffer; + enc->heap = 0; + } + + enc->end = enc->start + _cbBuffer; + enc->offset = enc->start; + + + encode (obj, enc, NULL, 0); + + Buffer_Reserve(enc, 1); + Buffer_AppendCharUnchecked(enc, '\0'); + + return enc->start; +} diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/src/ujson/python/JSONtoObj.c new file mode 100644 index 0000000000000..faec33f390cc6 --- /dev/null +++ b/pandas/src/ujson/python/JSONtoObj.c @@ -0,0 +1,650 @@ +#include +#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY +#define NO_IMPORT_ARRAY +#include +#include + + +typedef struct __PyObjectDecoder +{ + JSONObjectDecoder dec; + + void* npyarr; // Numpy context buffer + npy_intp curdim; // Current array dimension + + PyArray_Descr* dtype; +} PyObjectDecoder; + +typedef struct __NpyArrContext +{ + PyObject* ret; + PyObject* labels[2]; + PyArray_Dims shape; + + PyObjectDecoder* dec; + + npy_intp i; + npy_intp elsize; + npy_intp elcount; +} NpyArrContext; + +//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__) +#define PRINTMARK() + +// Numpy handling based on numpy internal code, specifically the function +// PyArray_FromIter. + +// numpy related functions are inter-dependent so declare them all here, +// to ensure the compiler catches any errors + +// standard numpy array handling +JSOBJ Object_npyNewArray(void* decoder); +JSOBJ Object_npyEndArray(JSOBJ obj); +int Object_npyArrayAddItem(JSOBJ obj, JSOBJ value); + +// for more complex dtypes (object and string) fill a standard Python list +// and convert to a numpy array when done. 
+JSOBJ Object_npyNewArrayList(void* decoder); +JSOBJ Object_npyEndArrayList(JSOBJ obj); +int Object_npyArrayListAddItem(JSOBJ obj, JSOBJ value); + +// labelled support, encode keys and values of JS object into separate numpy +// arrays +JSOBJ Object_npyNewObject(void* decoder); +JSOBJ Object_npyEndObject(JSOBJ obj); +int Object_npyObjectAddKey(JSOBJ obj, JSOBJ name, JSOBJ value); + + +// free the numpy context buffer +void Npy_releaseContext(NpyArrContext* npyarr) +{ + PRINTMARK(); + if (npyarr) + { + if (npyarr->shape.ptr) + { + PyObject_Free(npyarr->shape.ptr); + } + if (npyarr->dec) + { + // Don't set to null, used to make sure we don't Py_DECREF npyarr + // in releaseObject + // npyarr->dec->npyarr = NULL; + npyarr->dec->curdim = 0; + } + Py_XDECREF(npyarr->labels[0]); + Py_XDECREF(npyarr->labels[1]); + Py_XDECREF(npyarr->ret); + PyObject_Free(npyarr); + } +} + +JSOBJ Object_npyNewArray(void* _decoder) +{ + PRINTMARK(); + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + NpyArrContext* npyarr; + if (decoder->curdim <= 0) + { + // start of array - initialise the context buffer + npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + + if (!npyarr) + { + PyErr_NoMemory(); + return NULL; + } + + npyarr->dec = decoder; + npyarr->labels[0] = npyarr->labels[1] = NULL; + + npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp)*NPY_MAXDIMS); + npyarr->shape.len = 1; + npyarr->ret = NULL; + + npyarr->elsize = 0; + npyarr->elcount = 4; + npyarr->i = 0; + } + else + { + // starting a new dimension continue the current array (and reshape after) + npyarr = (NpyArrContext*) decoder->npyarr; + if (decoder->curdim >= npyarr->shape.len) + { + npyarr->shape.len++; + } + } + + npyarr->shape.ptr[decoder->curdim] = 0; + decoder->curdim++; + return npyarr; +} + +JSOBJ Object_npyEndArray(JSOBJ obj) +{ + PRINTMARK(); + NpyArrContext* npyarr = (NpyArrContext*) obj; + if (!npyarr) + { + return NULL; + } + + PyObject* ret = npyarr->ret; + int emptyType = NPY_DEFAULT_TYPE; + npy_intp i = npyarr->i; + char* new_data; + + npyarr->dec->curdim--; + + if (i == 0 || !npyarr->ret) { + // empty array would not have been initialised so do it now. 
+ if (npyarr->dec->dtype) + { + emptyType = npyarr->dec->dtype->type_num; + } + npyarr->ret = ret = PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0); + } + else if (npyarr->dec->curdim <= 0) + { + // realloc to final size + new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize); + if (new_data == NULL) { + PyErr_NoMemory(); + Npy_releaseContext(npyarr); + return NULL; + } + ((char*)PyArray_DATA(ret)) = new_data; + } + + if (npyarr->dec->curdim <= 0) + { + // finished decoding array, reshape if necessary + if (npyarr->shape.len > 1) + { + npyarr->ret = PyArray_Newshape((PyArrayObject*) ret, &npyarr->shape, NPY_ANYORDER); + Py_DECREF(ret); + ret = npyarr->ret; + } + + if (npyarr->labels[0] || npyarr->labels[1]) + { + // finished decoding, build tuple with values and labels + ret = PyTuple_New(npyarr->shape.len+1); + for (i = 0; i < npyarr->shape.len; i++) + { + if (npyarr->labels[i]) + { + PyTuple_SET_ITEM(ret, i+1, npyarr->labels[i]); + npyarr->labels[i] = NULL; + } + else + { + Py_INCREF(Py_None); + PyTuple_SET_ITEM(ret, i+1, Py_None); + } + } + PyTuple_SET_ITEM(ret, 0, npyarr->ret); + } + npyarr->ret = NULL; + Npy_releaseContext(npyarr); + } + + return ret; +} + +int Object_npyArrayAddItem(JSOBJ obj, JSOBJ value) +{ + PRINTMARK(); + NpyArrContext* npyarr = (NpyArrContext*) obj; + if (!npyarr) + { + return 0; + } + + PyObject* type; + PyArray_Descr* dtype; + npy_intp i = npyarr->i; + char *new_data, *item; + + npyarr->shape.ptr[npyarr->dec->curdim-1]++; + + if (PyArray_Check(value)) + { + // multidimensional array, keep decoding values. + return 1; + } + + if (!npyarr->ret) + { + // Array not initialised yet. + // We do it here so we can 'sniff' the data type if none was provided + if (!npyarr->dec->dtype) + { + type = PyObject_Type(value); + if(!PyArray_DescrConverter(type, &dtype)) + { + Py_DECREF(type); + goto fail; + } + Py_INCREF(dtype); + Py_DECREF(type); + } + else + { + dtype = PyArray_DescrNew(npyarr->dec->dtype); + } + + // If it's an object or string then fill a Python list and subsequently + // convert. Otherwise we would need to somehow mess about with + // reference counts when renewing memory. + npyarr->elsize = dtype->elsize; + if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) + { + Py_XDECREF(dtype); + + if (npyarr->dec->curdim > 1) + { + PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); + goto fail; + } + npyarr->ret = PyList_New(0); + if (!npyarr->ret) + { + goto fail; + } + ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArrayList; + ((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayListAddItem; + ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArrayList; + return Object_npyArrayListAddItem(obj, value); + } + + npyarr->ret = PyArray_NewFromDescr(&PyArray_Type, dtype, 1, + &npyarr->elcount, NULL,NULL, 0, NULL); + + if (!npyarr->ret) + { + goto fail; + } + } + + if (i >= npyarr->elcount) { + // Grow PyArray_DATA(ret): + // this is similar for the strategy for PyListObject, but we use + // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ... + if (npyarr->elsize == 0) + { + PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); + goto fail; + } + + npyarr->elcount = (i >> 1) + (i < 4 ? 
4 : 2) + i; + if (npyarr->elcount <= NPY_MAX_INTP/npyarr->elsize) { + new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret), npyarr->elcount * npyarr->elsize); + } + else { + PyErr_NoMemory(); + goto fail; + } + ((char*)PyArray_DATA(npyarr->ret)) = new_data; + } + + PyArray_DIMS(npyarr->ret)[0] = i + 1; + + if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL + || PyArray_SETITEM(npyarr->ret, item, value) == -1) { + goto fail; + } + + Py_DECREF( (PyObject *) value); + npyarr->i++; + return 1; + +fail: + + Npy_releaseContext(npyarr); + return 0; +} + +JSOBJ Object_npyNewArrayList(void* _decoder) +{ + PRINTMARK(); + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PyErr_SetString(PyExc_ValueError, "nesting not supported for object or variable length dtypes"); + Npy_releaseContext(decoder->npyarr); + return NULL; +} + +JSOBJ Object_npyEndArrayList(JSOBJ obj) +{ + PRINTMARK(); + NpyArrContext* npyarr = (NpyArrContext*) obj; + if (!npyarr) + { + return NULL; + } + + // convert decoded list to numpy array + PyObject* list = (PyObject *) npyarr->ret; + PyObject* ret = PyArray_FROM_O(list); + + ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArray; + ((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem; + ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArray; + Npy_releaseContext(npyarr); + return ret; +} + +int Object_npyArrayListAddItem(JSOBJ obj, JSOBJ value) +{ + PRINTMARK(); + NpyArrContext* npyarr = (NpyArrContext*) obj; + if (!npyarr) + { + return 0; + } + PyList_Append((PyObject*) npyarr->ret, value); + Py_DECREF( (PyObject *) value); + return 1; +} + + +JSOBJ Object_npyNewObject(void* _decoder) +{ + PRINTMARK(); + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + if (decoder->curdim > 1) + { + PyErr_SetString(PyExc_ValueError, "labels only supported up to 2 dimensions"); + return NULL; + } + + return ((JSONObjectDecoder*)decoder)->newArray(decoder); +} + +JSOBJ Object_npyEndObject(JSOBJ obj) +{ + PRINTMARK(); + NpyArrContext* npyarr = (NpyArrContext*) obj; + if (!npyarr) + { + return NULL; + } + + npy_intp labelidx = npyarr->dec->curdim-1; + + PyObject* list = npyarr->labels[labelidx]; + if (list) + { + npyarr->labels[labelidx] = PyArray_FROM_O(list); + Py_DECREF(list); + } + + return (PyObject*) ((JSONObjectDecoder*)npyarr->dec)->endArray(obj); +} + +int Object_npyObjectAddKey(JSOBJ obj, JSOBJ name, JSOBJ value) +{ + PRINTMARK(); + // add key to label array, value to values array + NpyArrContext* npyarr = (NpyArrContext*) obj; + if (!npyarr) + { + return 0; + } + + PyObject* label = (PyObject*) name; + npy_intp labelidx = npyarr->dec->curdim-1; + + if (!npyarr->labels[labelidx]) + { + npyarr->labels[labelidx] = PyList_New(0); + } + + // only fill label array once, assumes all column labels are the same + // for 2-dimensional arrays. 
+ if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) + { + PyList_Append(npyarr->labels[labelidx], label); + } + + if(((JSONObjectDecoder*)npyarr->dec)->arrayAddItem(obj, value)) + { + Py_DECREF(label); + return 1; + } + return 0; +} + +int Object_objectAddKey(JSOBJ obj, JSOBJ name, JSOBJ value) +{ + PyDict_SetItem (obj, name, value); + Py_DECREF( (PyObject *) name); + Py_DECREF( (PyObject *) value); + return 1; +} + +int Object_arrayAddItem(JSOBJ obj, JSOBJ value) +{ + PyList_Append(obj, value); + Py_DECREF( (PyObject *) value); + return 1; +} + +JSOBJ Object_newString(wchar_t *start, wchar_t *end) +{ + return PyUnicode_FromWideChar (start, (end - start)); +} + +JSOBJ Object_newTrue(void) +{ + Py_RETURN_TRUE; +} + +JSOBJ Object_newFalse(void) +{ + Py_RETURN_FALSE; +} + +JSOBJ Object_newNull(void) +{ + Py_RETURN_NONE; +} + +JSOBJ Object_newObject(void* decoder) +{ + return PyDict_New(); +} + +JSOBJ Object_endObject(JSOBJ obj) +{ + return obj; +} + +JSOBJ Object_newArray(void* decoder) +{ + return PyList_New(0); +} + +JSOBJ Object_endArray(JSOBJ obj) +{ + return obj; +} + +JSOBJ Object_newInteger(JSINT32 value) +{ + return PyInt_FromLong( (long) value); +} + +JSOBJ Object_newLong(JSINT64 value) +{ + return PyLong_FromLongLong (value); +} + +JSOBJ Object_newDouble(double value) +{ + return PyFloat_FromDouble(value); +} + +static void Object_releaseObject(JSOBJ obj, void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + if (obj != decoder->npyarr) + { + Py_XDECREF( ((PyObject *)obj)); + } +} + + +PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PRINTMARK(); + static char *kwlist[] = { "obj", "numpy", "labelled", "dtype", NULL}; + + PyObject *ret; + PyObject *sarg; + PyArray_Descr *dtype = NULL; + int numpy = 0, labelled = 0, decref = 0; + + PyObjectDecoder pyDecoder = + { + { + Object_newString, + Object_objectAddKey, + Object_arrayAddItem, + Object_newTrue, + Object_newFalse, + Object_newNull, + Object_newObject, + Object_endObject, + Object_newArray, + Object_endArray, + Object_newInteger, + Object_newLong, + Object_newDouble, + Object_releaseObject, + PyObject_Malloc, + PyObject_Free, + PyObject_Realloc, + } + }; + + pyDecoder.curdim = 0; + pyDecoder.npyarr = NULL; + + JSONObjectDecoder* decoder = (JSONObjectDecoder*) &pyDecoder; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|iiO&", kwlist, &sarg, &numpy, &labelled, PyArray_DescrConverter, &dtype)) + { + return NULL; + } + + if (PyUnicode_Check(sarg)) + { + sarg = PyUnicode_AsUTF8String(sarg); + if (sarg == NULL) + { + //Exception raised above us by codec according to docs + return NULL; + } + decref = 1; + } + else + if (!PyString_Check(sarg)) + { + PyErr_Format(PyExc_TypeError, "Expected String or Unicode"); + return NULL; + } + + if (numpy) + { + pyDecoder.dtype = dtype; + decoder->newArray = Object_npyNewArray; + decoder->endArray = Object_npyEndArray; + decoder->arrayAddItem = Object_npyArrayAddItem; + + if (labelled) + { + decoder->newObject = Object_npyNewObject; + decoder->endObject = Object_npyEndObject; + decoder->objectAddKey = Object_npyObjectAddKey; + } + } + + decoder->errorStr = NULL; + decoder->errorOffset = NULL; + + PRINTMARK(); + ret = JSON_DecodeObject(decoder, PyString_AS_STRING(sarg), PyString_GET_SIZE(sarg)); + PRINTMARK(); + + if (decref) + { + Py_DECREF(sarg); + } + + if (PyErr_Occurred()) + { + return NULL; + } + + if (decoder->errorStr) + { + /*FIXME: It's possible to give a much nicer error message here with actual failing element in input 
etc*/ + PyErr_Format (PyExc_ValueError, "%s", decoder->errorStr); + Py_XDECREF( (PyObject *) ret); + Npy_releaseContext(pyDecoder.npyarr); + + return NULL; + } + + return ret; +} + +PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *file; + PyObject *read; + PyObject *string; + PyObject *result; + PyObject *argtuple; + + if (!PyArg_ParseTuple (args, "O", &file)) { + return NULL; + } + + if (!PyObject_HasAttrString (file, "read")) + { + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + read = PyObject_GetAttrString (file, "read"); + + if (!PyCallable_Check (read)) { + Py_XDECREF(read); + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + string = PyObject_CallObject (read, NULL); + Py_XDECREF(read); + + if (string == NULL) + { + return NULL; + } + + argtuple = PyTuple_Pack(1, string); + + result = JSONToObj (self, argtuple, kwargs); + Py_XDECREF(string); + Py_DECREF(argtuple); + + if (result == NULL) { + return NULL; + } + + return result; +} + diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c new file mode 100644 index 0000000000000..3c6a2a929644c --- /dev/null +++ b/pandas/src/ujson/python/objToJSON.c @@ -0,0 +1,1554 @@ +#include +#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY +#include +#include +#include +#include +#include + +#define EPOCH_ORD 719163 + +static PyObject* cls_dataframe; +static PyObject* cls_series; +static PyObject* cls_index; + +typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, void *outValue, size_t *_outLen); + + +#if (PY_VERSION_HEX < 0x02050000) +typedef ssize_t Py_ssize_t; +#endif + +typedef struct __NpyArrContext +{ + PyObject *array; + char* dataptr; + int curdim; // current dimension in array's order + int stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) + npy_intp dim; + npy_intp stride; + npy_intp ndim; + npy_intp index[NPY_MAXDIMS]; + PyArray_GetItemFunc* getitem; + + char** rowLabels; + char** columnLabels; +} NpyArrContext; + +typedef struct __TypeContext +{ + JSPFN_ITERBEGIN iterBegin; + JSPFN_ITEREND iterEnd; + JSPFN_ITERNEXT iterNext; + JSPFN_ITERGETNAME iterGetName; + JSPFN_ITERGETVALUE iterGetValue; + PFN_PyTypeToJSON PyTypeToJSON; + PyObject *newObj; + PyObject *dictObj; + Py_ssize_t index; + Py_ssize_t size; + PyObject *itemValue; + PyObject *itemName; + PyObject *attrList; + char *citemName; + + JSINT64 longValue; + + NpyArrContext *npyarr; + int transpose; + char** rowLabels; + char** columnLabels; + npy_intp rowLabelsLen; + npy_intp columnLabelsLen; + +} TypeContext; + +typedef struct __PyObjectEncoder +{ + JSONObjectEncoder enc; + + // pass through the NpyArrContext when encoding multi-dimensional arrays + NpyArrContext* npyCtxtPassthru; + + // output format style for pandas data types + int outputFormat; +} PyObjectEncoder; + +#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv)) + +struct PyDictIterState +{ + PyObject *keys; + size_t i; + size_t sz; +}; + +enum PANDAS_FORMAT +{ + SPLIT, + RECORDS, + INDEX, + COLUMNS, + VALUES +}; + +//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__) +#define PRINTMARK() + +void initObjToJSON(void) +{ + PyDateTime_IMPORT; + + PyObject *mod_frame = PyImport_ImportModule("pandas.core.frame"); + cls_dataframe = PyObject_GetAttrString(mod_frame, "DataFrame"); + cls_index = PyObject_GetAttrString(mod_frame, "Index"); + cls_series = PyObject_GetAttrString(mod_frame, "Series"); + Py_DECREF(mod_frame); + + /* 
Initialise numpy API */ + import_array(); +} + +static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((JSINT32 *) outValue) = PyInt_AS_LONG (obj); + return NULL; +} + +static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((JSINT64 *) outValue) = PyInt_AS_LONG (obj); + return NULL; +} + +static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + *((JSINT64 *) outValue) = GET_TC(tc)->longValue; + return NULL; +} + +static void *NpyHalfToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + unsigned long ctype; + PyArray_ScalarAsCtype(obj, &ctype); + *((double *) outValue) = npy_half_to_double (ctype); + return NULL; +} + +static void *NpyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE)); + return NULL; +} + +static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((double *) outValue) = PyFloat_AS_DOUBLE (obj); + return NULL; +} + +static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *_outLen = PyString_GET_SIZE(obj); + return PyString_AS_STRING(obj); +} + +static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyObject *newObj = PyUnicode_EncodeUTF8 (PyUnicode_AS_UNICODE(obj), PyUnicode_GET_SIZE(obj), NULL); + + GET_TC(tc)->newObj = newObj; + + *_outLen = PyString_GET_SIZE(newObj); + return PyString_AS_STRING(newObj); +} + +static void *PyDateTimeToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + int y, m, d, h, mn, s, days; + + y = PyDateTime_GET_YEAR(obj); + m = PyDateTime_GET_MONTH(obj); + d = PyDateTime_GET_DAY(obj); + h = PyDateTime_DATE_GET_HOUR(obj); + mn = PyDateTime_DATE_GET_MINUTE(obj); + s = PyDateTime_DATE_GET_SECOND(obj); + + days = PyInt_AS_LONG(PyObject_CallMethod(PyDate_FromDate(y, m, 1), "toordinal", NULL)) - EPOCH_ORD + d - 1; + *( (JSINT64 *) outValue) = (((JSINT64) ((days * 24 + h) * 60 + mn)) * 60 + s); + return NULL; +} + +static void *PyDateToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + int y, m, d, days; + + y = PyDateTime_GET_YEAR(obj); + m = PyDateTime_GET_MONTH(obj); + d = PyDateTime_GET_DAY(obj); + + days = PyInt_AS_LONG(PyObject_CallMethod(PyDate_FromDate(y, m, 1), "toordinal", NULL)) - EPOCH_ORD + d - 1; + *( (JSINT64 *) outValue) = ((JSINT64) days * 86400); + + return NULL; +} + +//============================================================================= +// Numpy array iteration functions +//============================================================================= +int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) +{ + return 0; +} + +void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) +{ + PyArrayObject *obj; + + if (GET_TC(tc)->newObj) + { + obj = (PyArrayObject *) GET_TC(tc)->newObj; + } + else + { + obj = (PyArrayObject *) _obj; + } + + if (PyArray_SIZE(obj) > 0) + { + PRINTMARK(); + NpyArrContext *npyarr = PyMem_Malloc(sizeof(NpyArrContext)); + 
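 /* Annotation added for clarity; not part of the original patch.
  * The context allocated above drives all multi-dimensional traversal:
  * "stridedim" is the axis currently being strided over, and "inc" is the
  * direction in which stridedim moves when descending into a dimension.
  *  - Default case: start at axis 0 with inc = +1; NpyArr_iterNext then
  *    descends one axis per level until the innermost one, where
  *    NpyArr_iterNextItem emits the scalar values.
  *  - Transposed case (GET_TC(tc)->transpose set, e.g. by the default
  *    DataFrame "columns" orient later in this file): start at the last
  *    axis with inc = -1, walking the array column-major without copying it.
  * NpyArrPassThru_iterEnd rewinds dataptr once a dimension is exhausted. */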
GET_TC(tc)->npyarr = npyarr; + + if (!npyarr) + { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + + npyarr->array = (PyObject*) obj; + npyarr->getitem = (PyArray_GetItemFunc*) PyArray_DESCR(obj)->f->getitem; + npyarr->dataptr = PyArray_DATA(obj); + npyarr->ndim = PyArray_NDIM(obj) - 1; + npyarr->curdim = 0; + + if (GET_TC(tc)->transpose) + { + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; + } + else + { + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; + } + + npyarr->columnLabels = GET_TC(tc)->columnLabels; + npyarr->rowLabels = GET_TC(tc)->rowLabels; + } + else + { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + } + PRINTMARK(); +} + +void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->npyarr) + { + PyMem_Free(GET_TC(tc)->npyarr); + } + Py_XDECREF(GET_TC(tc)->newObj); + PRINTMARK(); +} + +void NpyArrPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); +} + +void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + // finished this dimension, reset the data pointer + NpyArrContext* npyarr = GET_TC(tc)->npyarr; + npyarr->curdim--; + npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; + npyarr->stridedim -= npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->dataptr += npyarr->stride; +} + +int NpyArr_iterNextItem(JSOBJ _obj, JSONTypeContext *tc) +{ + PRINTMARK(); + NpyArrContext* npyarr = GET_TC(tc)->npyarr; + + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) + { + return 0; + } + + GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + + npyarr->dataptr += npyarr->stride; + npyarr->index[npyarr->stridedim]++; + return 1; +} + +int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) +{ + PRINTMARK(); + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + + if (npyarr->curdim >= npyarr->ndim || npyarr->index[npyarr->stridedim] >= npyarr->dim) + { + // innermost dimension, start retrieving item values + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + return NpyArr_iterNextItem(_obj, tc); + } + + // dig a dimension deeper + npyarr->index[npyarr->stridedim]++; + + npyarr->curdim++; + npyarr->stridedim += npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->index[npyarr->stridedim] = 0; + + ((PyObjectEncoder*) tc->encoder)->npyCtxtPassthru = npyarr; + GET_TC(tc)->itemValue = npyarr->array; + return 1; +} + +JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + return GET_TC(tc)->itemValue; +} + +char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + PRINTMARK(); + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + npy_intp idx; + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) + { + idx = npyarr->index[npyarr->stridedim] - 1; + *outLen = strlen(npyarr->columnLabels[idx]); + return npyarr->columnLabels[idx]; + } + else + { + idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; + *outLen = strlen(npyarr->rowLabels[idx]); + return npyarr->rowLabels[idx]; + } +} + +//============================================================================= +// Tuple iteration functions +// itemValue is 
borrowed reference, no ref counting +//============================================================================= +void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyTuple_GET_SIZE( (PyObject *) obj); + GET_TC(tc)->itemValue = NULL; +} + +int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + PyObject *item; + + if (GET_TC(tc)->index >= GET_TC(tc)->size) + { + return 0; + } + + item = PyTuple_GET_ITEM (obj, GET_TC(tc)->index); + + GET_TC(tc)->itemValue = item; + GET_TC(tc)->index ++; + return 1; +} + +void Tuple_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ +} + +JSOBJ Tuple_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Tuple_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return NULL; +} + +//============================================================================= +// Dir iteration functions +// itemName ref is borrowed from PyObject_Dir (attrList). No refcount +// itemValue ref is from PyObject_GetAttr. Ref counted +//============================================================================= +void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->attrList = PyObject_Dir(obj); + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); + PRINTMARK(); +} + +void Dir_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + + Py_DECREF( (PyObject *) GET_TC(tc)->attrList); + PRINTMARK(); +} + +int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) +{ + PyObject *obj = (PyObject *) _obj; + PyObject *itemValue = GET_TC(tc)->itemValue; + PyObject *itemName = NULL; + + + if (itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = itemValue = NULL; + } + + for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index ++) + { + PyObject* attr = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); + char* attrStr = PyString_AS_STRING(attr); + + if (attrStr[0] == '_') + { + PRINTMARK(); + continue; + } + + itemValue = PyObject_GetAttr(obj, attr); + if (itemValue == NULL) + { + PyErr_Clear(); + PRINTMARK(); + continue; + } + + if (PyCallable_Check(itemValue)) + { + Py_DECREF(itemValue); + PRINTMARK(); + continue; + } + + PRINTMARK(); + itemName = attr; + break; + } + + if (itemName == NULL) + { + GET_TC(tc)->index = GET_TC(tc)->size; + GET_TC(tc)->itemValue = NULL; + return 0; + } + + GET_TC(tc)->itemName = itemName; + GET_TC(tc)->itemValue = itemValue; + GET_TC(tc)->index ++; + + PRINTMARK(); + return 1; +} + + + +JSOBJ Dir_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + return GET_TC(tc)->itemValue; +} + +char *Dir_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + PRINTMARK(); + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); +} + + + + +//============================================================================= +// List iteration functions +// itemValue is borrowed from object (which is list). 
No refcounting +//============================================================================= +void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE( (PyObject *) obj); +} + +int List_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->index >= GET_TC(tc)->size) + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->itemValue = PyList_GET_ITEM (obj, GET_TC(tc)->index); + GET_TC(tc)->index ++; + return 1; +} + +void List_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ +} + +JSOBJ List_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return NULL; +} + +//============================================================================= +// pandas Index iteration functions +//============================================================================= +void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->citemName = PyMem_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->citemName) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + if (!GET_TC(tc)->citemName) + { + return 0; + } + + Py_ssize_t index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->citemName, "name", 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->citemName, "data", 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->citemName) + { + PyMem_Free(GET_TC(tc)->citemName); + } + PRINTMARK(); +} + +JSOBJ Index_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->citemName); + return GET_TC(tc)->citemName; +} + +//============================================================================= +// pandas Series iteration functions +//============================================================================= +void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->citemName = PyMem_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->citemName) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + if (!GET_TC(tc)->citemName) + { + return 0; + } + + Py_ssize_t index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->citemName, "name", 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->citemName, "index", 6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } + else + if (index == 2) + { + memcpy(GET_TC(tc)->citemName, "data", 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->citemName) + { + PyMem_Free(GET_TC(tc)->citemName); + } + PRINTMARK(); +} + +JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t 
*outLen) +{ + *outLen = strlen(GET_TC(tc)->citemName); + return GET_TC(tc)->citemName; +} + +//============================================================================= +// pandas DataFrame iteration functions +//============================================================================= +void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->citemName = PyMem_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->citemName) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + if (!GET_TC(tc)->citemName) + { + return 0; + } + + Py_ssize_t index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->citemName, "columns", 8); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->citemName, "index", 6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } + else + if (index == 2) + { + memcpy(GET_TC(tc)->citemName, "data", 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->citemName) + { + PyMem_Free(GET_TC(tc)->citemName); + } + PRINTMARK(); +} + +JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->citemName); + return GET_TC(tc)->citemName; +} + +//============================================================================= +// Dict iteration functions +// itemName might converted to string (Python_Str). Do refCounting +// itemValue is borrowed from object (which is dict). 
No refCounting +//============================================================================= +void Dict_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + PRINTMARK(); +} + +int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + + + if (!PyDict_Next ( (PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) + { + PRINTMARK(); + return 0; + } + + if (PyUnicode_Check(GET_TC(tc)->itemName)) + { + GET_TC(tc)->itemName = PyUnicode_EncodeUTF8 ( + PyUnicode_AS_UNICODE(GET_TC(tc)->itemName), + PyUnicode_GET_SIZE(GET_TC(tc)->itemName), + NULL + ); + } + else + if (!PyString_Check(GET_TC(tc)->itemName)) + { + GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); + } + else + { + Py_INCREF(GET_TC(tc)->itemName); + } + PRINTMARK(); + return 1; +} + +void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + Py_DECREF(GET_TC(tc)->dictObj); + PRINTMARK(); +} + +JSOBJ Dict_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Dict_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); +} + +void NpyArr_freeLabels(char** labels, npy_intp len) +{ + npy_intp i; + + if (labels) + { + for (i = 0; i < len; i++) + { + PyMem_Free(labels[i]); + } + PyMem_Free(labels); + } +} + +char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_intp num) +{ + PRINTMARK(); + npy_intp i, stride, bufsize, len; + char** ret; + char *dataptr, *cLabel; + PyArray_GetItemFunc* getitem; + + if (PyArray_SIZE(labels) < num) + { + PyErr_SetString(PyExc_ValueError, "Label array sizes do not match corresponding data shape"); + return 0; + } + + ret = PyMem_Malloc(sizeof(char*)*num); + if (!ret) + { + PyErr_NoMemory(); + return 0; + } + + bufsize = enc->end - enc->start; + stride = PyArray_STRIDE(labels, 0); + dataptr = PyArray_DATA(labels); + getitem = PyArray_DESCR(labels)->f->getitem; + + for (i = 0; i < num; i++) + { + cLabel = JSON_EncodeObject(getitem(dataptr, labels), enc, enc->start, bufsize); + + // trim off any quotes surrounding the result + if (*cLabel == '\"') + { + cLabel++; + enc->offset -= 2; + *(enc->offset) = '\0'; + } + + len = enc->offset - cLabel + 1; + ret[i] = PyMem_Malloc(sizeof(char)*len); + + if (!ret[i]) + { + PyErr_NoMemory(); + return 0; + } + + memcpy(ret[i], cLabel, len); + dataptr += stride; + } + + enc->offset = enc->start; + return ret; +} + +void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) +{ + PRINTMARK(); + if (!_obj) { + tc->type = JT_INVALID; + return; + } + + PyObject* obj = (PyObject*) _obj; + TypeContext *pc = (TypeContext *) tc->prv; + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + PyObject *toDictFunc; + + int i; + for (i = 0; i < 32; i++) + { + tc->prv[i] = 0; + } + + if (PyIter_Check(obj) || PyArray_Check(obj)) + { + goto ISITERABLE; + } + + if (PyBool_Check(obj)) + { + PRINTMARK(); + tc->type = (obj == Py_True) ? 
JT_TRUE : JT_FALSE; + return; + } + else + if (PyInt_Check(obj)) + { + PRINTMARK(); +#ifdef _LP64 + pc->PyTypeToJSON = PyIntToINT64; tc->type = JT_LONG; +#else + pc->PyTypeToJSON = PyIntToINT32; tc->type = JT_INT; +#endif + return; + } + else + if (PyLong_Check(obj)) + { + PyObject *exc; + + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + GET_TC(tc)->longValue = PyLong_AsLongLong(obj); + + exc = PyErr_Occurred(); + + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + { + PRINTMARK(); + tc->type = JT_INVALID; + return; + } + + return; + } + else + if (PyArray_IsScalar(obj, Integer)) + { + PyObject *exc; + + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_LONG)); + + exc = PyErr_Occurred(); + + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + { + PRINTMARK(); + tc->type = JT_INVALID; + return; + } + + return; + } + else + if (PyString_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyStringToUTF8; tc->type = JT_UTF8; + return; + } + else + if (PyUnicode_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyUnicodeToUTF8; tc->type = JT_UTF8; + return; + } + else + if (PyFloat_Check(obj)) + { + PRINTMARK(); + double val = PyFloat_AS_DOUBLE (obj); + if (npy_isnan(val) || npy_isinf(val)) + { + tc->type = JT_NULL; + } + else + { + pc->PyTypeToJSON = PyFloatToDOUBLE; tc->type = JT_DOUBLE; + } + return; + } + else + if (PyArray_IsScalar(obj, Float)) + { + PRINTMARK(); + pc->PyTypeToJSON = NpyFloatToDOUBLE; tc->type = JT_DOUBLE; + return; + } + else + if (PyArray_IsScalar(obj, Half)) + { + PRINTMARK(); + pc->PyTypeToJSON = NpyHalfToDOUBLE; tc->type = JT_DOUBLE; + return; + } + else + if (PyDateTime_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyDateTimeToINT64; tc->type = JT_LONG; + return; + } + else + if (PyDate_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyDateToINT64; tc->type = JT_LONG; + return; + } + else + if (obj == Py_None) + { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + + +ISITERABLE: + + if (PyDict_Check(obj)) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = obj; + Py_INCREF(obj); + + return; + } + else + if (PyList_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = List_iterBegin; + pc->iterEnd = List_iterEnd; + pc->iterNext = List_iterNext; + pc->iterGetValue = List_iterGetValue; + pc->iterGetName = List_iterGetName; + return; + } + else + if (PyTuple_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = Tuple_iterBegin; + pc->iterEnd = Tuple_iterEnd; + pc->iterNext = Tuple_iterNext; + pc->iterGetValue = Tuple_iterGetValue; + pc->iterGetName = Tuple_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_index)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Index_iterBegin; + pc->iterEnd = Index_iterEnd; + pc->iterNext = Index_iterNext; + pc->iterGetValue = Index_iterGetValue; + pc->iterGetName = Index_iterGetName; + return; + } + + PRINTMARK(); + tc->type = JT_ARRAY; + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + 
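 /* Annotation added for clarity; not part of the original patch.
  * For pandas objects the encoder dispatches on enc->outputFormat: an Index
  * is emitted either as {"name": ..., "data": [...]} (orient == "split",
  * via the Index_iter* callbacks above) or as a plain JSON array of its
  * values. The Series and DataFrame branches that follow apply the same
  * pattern, adding orient-specific row/column label handling. */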
} + else + if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_series)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + enc->outputFormat = RECORDS; // for contained index + tc->type = JT_OBJECT; + pc->iterBegin = Series_iterBegin; + pc->iterEnd = Series_iterEnd; + pc->iterNext = Series_iterNext; + pc->iterGetValue = Series_iterGetValue; + pc->iterGetName = Series_iterGetName; + return; + } + + if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->columnLabelsLen = PyArray_SIZE(obj); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + tc->type = JT_INVALID; + return; + } + } + else + { + PRINTMARK(); + tc->type = JT_ARRAY; + } + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyArray_Check(obj)) + { + if (enc->npyCtxtPassthru) + { + PRINTMARK(); + pc->npyarr = enc->npyCtxtPassthru; + tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY); + pc->iterBegin = NpyArrPassThru_iterBegin; + pc->iterEnd = NpyArrPassThru_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + enc->npyCtxtPassthru = NULL; + return; + } + + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_dataframe)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + enc->outputFormat = RECORDS; // for contained index and series + tc->type = JT_OBJECT; + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; + return; + } + + PRINTMARK(); + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + if (enc->outputFormat == VALUES) + { + PRINTMARK(); + tc->type = JT_ARRAY; + } + else + if (enc->outputFormat == RECORDS) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + tc->type = JT_INVALID; + return; + } + } + else + if (enc->outputFormat == INDEX) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->rowLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + if (!pc->rowLabels) + { + tc->type = JT_INVALID; + return; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + tc->type = JT_INVALID; + return; + } + } + else + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->rowLabelsLen = 
PyArray_DIM(pc->newObj, 1); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + if (!pc->rowLabels) + { + tc->type = JT_INVALID; + return; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + tc->type = JT_INVALID; + return; + } + pc->transpose = 1; + } + return; + } + + + toDictFunc = PyObject_GetAttrString(obj, "toDict"); + + if (toDictFunc) + { + PyObject* tuple = PyTuple_New(0); + PyObject* toDictResult = PyObject_Call(toDictFunc, tuple, NULL); + Py_DECREF(tuple); + Py_DECREF(toDictFunc); + + if (toDictResult == NULL) + { + PyErr_Clear(); + tc->type = JT_NULL; + return; + } + + if (!PyDict_Check(toDictResult)) + { + Py_DECREF(toDictResult); + tc->type = JT_NULL; + return; + } + + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = toDictResult; + return; + } + + PyErr_Clear(); + + tc->type = JT_OBJECT; + pc->iterBegin = Dir_iterBegin; + pc->iterEnd = Dir_iterEnd; + pc->iterNext = Dir_iterNext; + pc->iterGetValue = Dir_iterGetValue; + pc->iterGetName = Dir_iterGetName; + + return; +} + + +void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_XDECREF(GET_TC(tc)->newObj); + NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); + NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); +} + +const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) +{ + return GET_TC(tc)->PyTypeToJSON (obj, tc, NULL, _outLen); +} + +JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) +{ + JSINT64 ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + + return ret; +} + +JSINT32 Object_getIntValue(JSOBJ obj, JSONTypeContext *tc) +{ + JSINT32 ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + return ret; +} + + +double Object_getDoubleValue(JSOBJ obj, JSONTypeContext *tc) +{ + double ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + return ret; +} + +static void Object_releaseObject(JSOBJ _obj) +{ + Py_DECREF( (PyObject *) _obj); +} + + + +void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->iterBegin(obj, tc); +} + +int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->iterNext(obj, tc); +} + +void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->iterEnd(obj, tc); +} + +JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->iterGetValue(obj, tc); +} + +char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return GET_TC(tc)->iterGetName(obj, tc, outLen); +} + + +PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = { "obj", "ensure_ascii", "double_precision", "orient", NULL}; + + char buffer[65536]; + char *ret; + PyObject *newobj; + PyObject *oinput = NULL; + PyObject *oensureAscii = NULL; + char *sOrient = NULL; + int idoublePrecision = 5; // default double precision setting + + PyObjectEncoder pyEncoder = + { + { + Object_beginTypeContext, //void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc); + Object_endTypeContext, //void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc); + Object_getStringValue, //const char 
*(*getStringValue)(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen); + Object_getLongValue, //JSLONG (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); + Object_getIntValue, //JSLONG (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); + Object_getDoubleValue, //double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); + Object_iterBegin, //JSPFN_ITERBEGIN iterBegin; + Object_iterNext, //JSPFN_ITERNEXT iterNext; + Object_iterEnd, //JSPFN_ITEREND iterEnd; + Object_iterGetValue, //JSPFN_ITERGETVALUE iterGetValue; + Object_iterGetName, //JSPFN_ITERGETNAME iterGetName; + Object_releaseObject, //void (*releaseValue)(JSONTypeContext *ti); + PyObject_Malloc, //JSPFN_MALLOC malloc; + PyObject_Realloc, //JSPFN_REALLOC realloc; + PyObject_Free, //JSPFN_FREE free; + -1, //recursionMax + idoublePrecision, + 1, //forceAscii + } + }; + JSONObjectEncoder* encoder = (JSONObjectEncoder*) &pyEncoder; + + pyEncoder.npyCtxtPassthru = NULL; + pyEncoder.outputFormat = COLUMNS; + + PRINTMARK(); + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Ois", kwlist, &oinput, &oensureAscii, &idoublePrecision, &sOrient)) + { + return NULL; + } + + if (sOrient != NULL) + { + if (strcmp(sOrient, "records") == 0) + { + pyEncoder.outputFormat = RECORDS; + } + else + if (strcmp(sOrient, "index") == 0) + { + pyEncoder.outputFormat = INDEX; + } + else + if (strcmp(sOrient, "split") == 0) + { + pyEncoder.outputFormat = SPLIT; + } + else + if (strcmp(sOrient, "values") == 0) + { + pyEncoder.outputFormat = VALUES; + } + else + if (strcmp(sOrient, "columns") != 0) + { + PyErr_Format (PyExc_ValueError, "Invalid value '%s' for option 'orient'", sOrient); + return NULL; + } + } + + if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) + { + encoder->forceASCII = 0; + } + + encoder->doublePrecision = idoublePrecision; + + PRINTMARK(); + ret = JSON_EncodeObject (oinput, encoder, buffer, sizeof (buffer)); + PRINTMARK(); + + if (PyErr_Occurred()) + { + return NULL; + } + + if (encoder->errorMsg) + { + if (ret != buffer) + { + encoder->free (ret); + } + + PyErr_Format (PyExc_OverflowError, "%s", encoder->errorMsg); + return NULL; + } + + newobj = PyString_FromString (ret); + + if (ret != buffer) + { + encoder->free (ret); + } + + PRINTMARK(); + + return newobj; +} + +PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *data; + PyObject *file; + PyObject *string; + PyObject *write; + PyObject *argtuple; + + PRINTMARK(); + + if (!PyArg_ParseTuple (args, "OO", &data, &file)) { + return NULL; + } + + if (!PyObject_HasAttrString (file, "write")) + { + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + write = PyObject_GetAttrString (file, "write"); + + if (!PyCallable_Check (write)) { + Py_XDECREF(write); + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + argtuple = PyTuple_Pack(1, data); + + string = objToJSON (self, argtuple, kwargs); + + if (string == NULL) + { + Py_XDECREF(write); + Py_XDECREF(argtuple); + return NULL; + } + + Py_XDECREF(argtuple); + + argtuple = PyTuple_Pack (1, string); + if (argtuple == NULL) + { + Py_XDECREF(write); + return NULL; + } + if (PyObject_CallObject (write, argtuple) == NULL) + { + Py_XDECREF(write); + Py_XDECREF(argtuple); + return NULL; + } + + Py_XDECREF(write); + Py_DECREF(argtuple); + Py_XDECREF(string); + + PRINTMARK(); + + Py_RETURN_NONE; + + +} + diff --git a/pandas/src/ujson/python/ujson.c b/pandas/src/ujson/python/ujson.c new file mode 100644 index 0000000000000..21f7ba8b106cf --- /dev/null +++ 
b/pandas/src/ujson/python/ujson.c @@ -0,0 +1,41 @@ +#include +#include "version.h" + +/* objToJSON */ +PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs); +void initObjToJSON(void); + +/* JSONToObj */ +PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs); + +/* objToJSONFile */ +PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs); + +/* JSONFileToObj */ +PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs); + + +static PyMethodDef ujsonMethods[] = { + {"encode", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8. Pass in double_precision to alter the maximum digit precision with doubles"}, + {"decode", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure"}, + {"dumps", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"}, + {"loads", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure"}, + {"dump", (PyCFunction) objToJSONFile, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON file. Use ensure_ascii=false to output UTF-8"}, + {"load", (PyCFunction) JSONFileToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as file to dict object structure"}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; + + + +PyMODINIT_FUNC +init_ujson(void) +{ + PyObject *module; + PyObject *version_string; + + initObjToJSON(); + module = Py_InitModule("_ujson", ujsonMethods); + + version_string = PyString_FromString (UJSON_VERSION); + PyModule_AddObject (module, "__version__", version_string); +} diff --git a/pandas/src/ujson/python/version.h b/pandas/src/ujson/python/version.h new file mode 100644 index 0000000000000..9449441411192 --- /dev/null +++ b/pandas/src/ujson/python/version.h @@ -0,0 +1 @@ +#define UJSON_VERSION "1.18" diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd index c1c76b726a6d7..3ebd72cc83ee4 100644 --- a/pandas/src/util.pxd +++ b/pandas/src/util.pxd @@ -4,6 +4,7 @@ cimport numpy as cnp cdef extern from "numpy_helper.h": inline int is_integer_object(object) inline int is_float_object(object) + inline int is_complex_object(object) inline int is_bool_object(object) inline int is_string_object(object) inline int is_datetime64_object(object) @@ -60,4 +61,3 @@ cdef inline bint _checknull(object val): cdef inline bint _checknan(object val): return not cnp.PyArray_Check(val) and val != val - diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 06b0a6798e9b1..a64b880c3478e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2,6 +2,31 @@ import numpy as np + import pandas.core.algorithms as algos import pandas.util.testing as tm + +class TestMatch(unittest.TestCase): + + def test_ints(self): + values = np.array([0, 2, 1]) + to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0]) + + result = algos.match(to_match, values) + expected = np.array([0, 2, 1, 1, 0, 2, -1, 0]) + self.assert_(np.array_equal(result, expected)) + + def test_strings(self): + values = ['foo', 'bar', 'baz'] + to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux'] + + result = algos.match(to_match, values) + expected = np.array([1, 0, -1, 0, 1, 2, -1]) + self.assert_(np.array_equal(result, expected)) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', 
'--pdb-failure'], + exit=False) + diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b21bd09957bd7..4cda34cbc89ee 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1,3 +1,4 @@ + # pylint: disable-msg=W0612,E1101 from copy import deepcopy from datetime import datetime, timedelta @@ -1322,6 +1323,10 @@ def test_set_columns(self): self.assertRaises(Exception, setattr, self.mixed_frame, 'columns', cols[::2]) + def test_keys(self): + getkeys = self.frame.keys + self.assert_(getkeys() is self.frame.columns) + def test_column_contains_typeerror(self): try: self.frame.columns in self.frame @@ -1601,6 +1606,73 @@ def test_constructor_maskedarray(self): frame = DataFrame(ma.masked_all((3, 0))) self.assert_(len(frame.columns) == 0) + def test_constructor_maskedarray_nonfloat(self): + # masked int promoted to float + mat = ma.masked_all((2, 3), dtype=int) + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(np.all(~np.asarray(frame == frame))) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=float) + self.assert_(frame.values.dtype == np.float64) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0,0] = 1 + mat2[1,2] = 2 + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(1, frame['A'][1]) + self.assertEqual(2, frame['C'][2]) + + # masked np.datetime64 stays (use lib.NaT as null) + mat = ma.masked_all((2, 3), dtype='M8[ns]') + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(isnull(frame).values.all()) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=np.int64) + self.assert_(frame.values.dtype == np.int64) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0,0] = 1 + mat2[1,2] = 2 + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(1, frame['A'].view('i8')[1]) + self.assertEqual(2, frame['C'].view('i8')[2]) + + # masked bool promoted to object + mat = ma.masked_all((2, 3), dtype=bool) + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(np.all(~np.asarray(frame == frame))) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=object) + self.assert_(frame.values.dtype == object) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0,0] = True + mat2[1,2] = False + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(True, frame['A'][1]) + self.assertEqual(False, frame['C'][2]) + def test_constructor_corner(self): df = DataFrame(index=[]) self.assertEqual(df.values.shape, (0, 0)) @@ -1615,12 +1687,13 @@ def test_constructor_corner(self): def test_constructor_scalar_inference(self): data = {'int' : 1, 'bool' : True, - 'float' : 3., 'object' : 'foo'} + 'float' : 3., 'complex': 4j, 'object' : 'foo'} df = DataFrame(data, index=np.arange(10)) self.assert_(df['int'].dtype == np.int64) self.assert_(df['bool'].dtype == np.bool_) self.assert_(df['float'].dtype == np.float64) + self.assert_(df['complex'].dtype == np.complex128) self.assert_(df['object'].dtype == np.object_) def test_constructor_DataFrame(self): @@ -1924,6 +1997,143 @@ def test_to_dict(self): for 
k2, v2 in v.iteritems(): self.assertEqual(v2, recons_data[k][k2]) + def test_from_json_to_json(self): + + def _check_orient(df, orient, dtype=None, numpy=True): + df = df.sort() + dfjson = df.to_json(orient=orient) + unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype, + numpy=numpy) + unser = unser.sort() + mktimestamp = datetime.fromtimestamp + if df.index.dtype == np.datetime64: + unser.index = [mktimestamp(float(d)) for d in unser.index] + if orient == "records": + # index is not captured in this orientation + assert_almost_equal(df.values, unser.values) + self.assert_(df.columns.equals(unser.columns)) + elif orient == "values": + # index and cols are not captured in this orientation + assert_almost_equal(df.values, unser.values) + elif orient == "split": + # index and col labels might not be strings + unser.index = [str(i) for i in unser.index] + unser.columns = [str(i) for i in unser.columns] + unser = unser.sort() + assert_almost_equal(df.values, unser.values) + else: + assert_frame_equal(df, unser) + + def _check_all_orients(df, dtype=None): + _check_orient(df, "columns", dtype=dtype) + _check_orient(df, "records", dtype=dtype) + _check_orient(df, "split", dtype=dtype) + _check_orient(df, "index", dtype=dtype) + _check_orient(df, "values", dtype=dtype) + + _check_orient(df, "columns", dtype=dtype, numpy=False) + _check_orient(df, "records", dtype=dtype, numpy=False) + _check_orient(df, "split", dtype=dtype, numpy=False) + _check_orient(df, "index", dtype=dtype, numpy=False) + _check_orient(df, "values", dtype=dtype, numpy=False) + + # basic + _check_all_orients(self.frame) + self.assertEqual(self.frame.to_json(), + self.frame.to_json(orient="columns")) + + _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) + + # big one + # index and columns are strings as all unserialised JSON object keys + # are assumed to be strings + biggie = DataFrame(np.zeros((200, 4)), + columns=[str(i) for i in range(4)], + index=[str(i) for i in range(200)]) + _check_all_orients(biggie) + + # dtypes + _check_all_orients(DataFrame(biggie, dtype=np.float64), + dtype=np.float64) + _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int) + _check_all_orients(DataFrame(biggie, dtype='>> A >>> B + key lvalue group key rvalue + 0 a 1 a 0 b 1 + 1 c 2 a 1 c 2 + 2 e 3 a 2 d 3 + 3 a 1 b + 4 c 2 b + 5 e 3 b + + >>> ordered_merge(A, B, fill_method='ffill', left_by='group') + key lvalue group rvalue + 0 a 1 a NaN + 1 b 1 a 1 + 2 c 2 a 2 + 3 d 2 a 3 + 4 e 3 a 3 + 5 f 3 a 4 + 6 a 1 b NaN + 7 b 1 b 1 + 8 c 2 b 2 + 9 d 2 b 3 + 10 e 3 b 3 + 11 f 3 b 4 + + Returns + ------- + merged : DataFrame + """ + def _merger(x, y): + op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on, + # left_index=left_index, right_index=right_index, + suffixes=suffixes, fill_method=fill_method) + return op.get_result() + + if left_by is not None and right_by is not None: + raise ValueError('Can only group either left or right frames') + elif left_by is not None: + if not isinstance(left_by, (list, tuple)): + left_by = [left_by] + pieces = [] + for key, xpiece in left.groupby(left_by): + merged = _merger(xpiece, right) + for k in left_by: + # May have passed ndarray + try: + if k in merged: + merged[k] = key + except: + pass + pieces.append(merged) + return concat(pieces, ignore_index=True) + elif right_by is not None: + if not isinstance(right_by, (list, tuple)): + right_by = [right_by] + pieces = [] + for key, ypiece in right.groupby(right_by): + merged = _merger(left, ypiece) + for k in right_by: 
+ try: + if k in merged: + merged[k] = key + except: + pass + pieces.append(merged) + return concat(pieces, ignore_index=True) + else: + return _merger(left, right) + + # TODO: NA group handling # TODO: transformations?? @@ -47,7 +150,7 @@ class _MergeOperation(object): def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, - suffixes=('.x', '.y'), copy=True): + suffixes=('_x', '_y'), copy=True): self.left = self.orig_left = left self.right = self.orig_right = right self.how = how @@ -145,9 +248,6 @@ def _get_join_info(self): # max groups = largest possible number of distinct groups left_key, right_key, max_groups = self._get_group_keys() - # left_key = com._ensure_int64(left_key) - # right_key = com._ensure_int64(right_key) - join_func = _join_functions[self.how] left_indexer, right_indexer = join_func(left_key, right_key, max_groups) @@ -304,7 +404,7 @@ def _get_group_keys(self): group_sizes = [] for lk, rk in zip(left_keys, right_keys): - llab, rlab, count = _factorize_objects(lk, rk, sort=self.sort) + llab, rlab, count = _factorize_keys(lk, rk, sort=self.sort) left_labels.append(llab) right_labels.append(rlab) @@ -321,15 +421,58 @@ def _get_group_keys(self): raise Exception('Combinatorial explosion! (boom)') left_group_key, right_group_key, max_groups = \ - _factorize_int64(left_group_key, right_group_key, + _factorize_keys(left_group_key, right_group_key, sort=self.sort) return left_group_key, right_group_key, max_groups + +class _OrderedMerge(_MergeOperation): + + def __init__(self, left, right, on=None, by=None, left_on=None, + right_on=None, axis=1, left_index=False, right_index=False, + suffixes=('_x', '_y'), copy=True, + fill_method=None): + + self.fill_method = fill_method + + _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, + right_on=right_on, axis=axis, + left_index=left_index, + right_index=right_index, + how='outer', suffixes=suffixes, + sort=True # sorts when factorizing + ) + + + def get_result(self): + join_index, left_indexer, right_indexer = self._get_join_info() + + # this is a bit kludgy + ldata, rdata = self._get_merge_data() + + if self.fill_method == 'ffill': + left_join_indexer = lib.ffill_indexer(left_indexer) + right_join_indexer = lib.ffill_indexer(right_indexer) + else: + left_join_indexer = left_indexer + right_join_indexer = right_indexer + + join_op = _BlockJoinOperation([ldata, rdata], join_index, + [left_join_indexer, right_join_indexer], + axis=1, copy=self.copy) + + result_data = join_op.get_result() + result = DataFrame(result_data) + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + return result + def _get_multiindex_indexer(join_keys, index, sort=False): shape = [] labels = [] for level, key in zip(index.levels, join_keys): - llab, rlab, count = _factorize_objects(level, key, sort=False) + llab, rlab, count = _factorize_keys(level, key, sort=False) labels.append(rlab) shape.append(count) @@ -337,8 +480,8 @@ def _get_multiindex_indexer(join_keys, index, sort=False): right_group_key = get_group_index(index.labels, shape) left_group_key, right_group_key, max_groups = \ - _factorize_int64(left_group_key, right_group_key, - sort=False) + _factorize_keys(left_group_key, right_group_key, + sort=False) left_indexer, right_indexer = \ lib.left_outer_join(com._ensure_int64(left_group_key), @@ -348,7 +491,7 @@ def _get_multiindex_indexer(join_keys, index, sort=False): return left_indexer, right_indexer def _get_single_indexer(join_key, 
index, sort=False): - left_key, right_key, count = _factorize_objects(join_key, index, sort=sort) + left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = \ lib.left_outer_join(com._ensure_int64(left_key), @@ -357,10 +500,6 @@ def _get_single_indexer(join_key, index, sort=False): return left_indexer, right_indexer -def _right_outer_join(x, y, max_groups): - right_indexer, left_indexer = lib.left_outer_join(y, x, max_groups) - return left_indexer, right_indexer - def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): join_index = left_ax left_indexer = None @@ -387,6 +526,10 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): return join_index, left_indexer, right_indexer +def _right_outer_join(x, y, max_groups): + right_indexer, left_indexer = lib.left_outer_join(y, x, max_groups) + return left_indexer, right_indexer + _join_functions = { 'inner' : lib.inner_join, 'left' : lib.left_outer_join, @@ -394,26 +537,21 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): 'outer' : lib.full_outer_join, } -def _factorize_int64(left_index, right_index, sort=True): - rizer = lib.Int64Factorizer(max(len(left_index), len(right_index))) - # 32-bit compatibility - left_index = com._ensure_int64(left_index) - right_index = com._ensure_int64(right_index) - - llab, _ = rizer.factorize(left_index) - rlab, _ = rizer.factorize(right_index) - - if sort: - llab, rlab = _sort_labels(np.array(rizer.uniques), llab, rlab) - - return llab, rlab, rizer.get_count() +def _factorize_keys(lk, rk, sort=True): + if com.is_integer_dtype(lk) and com.is_integer_dtype(rk): + klass = lib.Int64Factorizer + lk = com._ensure_int64(lk) + rk = com._ensure_int64(rk) + else: + klass = lib.Factorizer + lk = com._ensure_object(lk) + rk = com._ensure_object(rk) -def _factorize_objects(left_index, right_index, sort=True): - rizer = lib.Factorizer(max(len(left_index), len(right_index))) + rizer = klass(max(len(lk), len(rk))) - llab, _ = rizer.factorize(left_index.astype('O')) - rlab, _ = rizer.factorize(right_index.astype('O')) + llab, _ = rizer.factorize(lk) + rlab, _ = rizer.factorize(rk) count = rizer.get_count() diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 8168e1367f962..9fd3e5d173bf9 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -3,6 +3,7 @@ from itertools import izip import numpy as np +from scipy import stats from pandas.util.decorators import cache_readonly import pandas.core.common as com @@ -12,12 +13,19 @@ from pandas.tseries.offsets import DateOffset def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, - **kwds): + diagonal='hist', **kwds): """ Draw a matrix of scatter plots. Parameters ---------- + alpha : amount of transparency applied + figsize : a tuple (width, height) in inches + ax : Matplotlib axis object + grid : setting this to True will show the grid + diagonal : pick between 'kde' and 'hist' for + either Kernel Density Estimation or Histogram + plon in the diagonal kwds : other plotting keyword arguments To be passed to scatter function @@ -36,7 +44,18 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, for i, a in zip(range(n), df.columns): for j, b in zip(range(n), df.columns): - axes[i, j].scatter(df[b], df[a], alpha=alpha, **kwds) + if i == j: + # Deal with the diagonal by drawing a histogram there. 
+ if diagonal == 'hist': + axes[i, j].hist(df[a]) + elif diagonal == 'kde': + y = df[a] + gkde = stats.gaussian_kde(y) + ind = np.linspace(min(y), max(y), 1000) + axes[i, j].plot(ind, gkde.evaluate(ind), **kwds) + else: + axes[i, j].scatter(df[b], df[a], alpha=alpha, **kwds) + axes[i, j].set_xlabel('') axes[i, j].set_ylabel('') axes[i, j].set_xticklabels([]) @@ -44,7 +63,7 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, ticks = df.index is_datetype = ticks.inferred_type in ('datetime', 'date', - 'datetime64') + 'datetime64') if ticks.is_numeric() or is_datetype: """ @@ -87,13 +106,6 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, axes[i, j].grid(b=grid) - # ensure {x,y}lim off diagonal are the same as diagonal - for i in range(n): - for j in range(n): - if i != j: - axes[i, j].set_xlim(axes[j, j].get_xlim()) - axes[i, j].set_ylim(axes[i, i].get_ylim()) - return axes def _gca(): @@ -326,6 +338,38 @@ def _get_xticks(self): return x +class KdePlot(MPLPlot): + def __init__(self, data, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + + def _get_plot_function(self): + return self.plt.Axes.plot + + def _make_plot(self): + plotf = self._get_plot_function() + for i, (label, y) in enumerate(self._iter_data()): + if self.subplots: + ax = self.axes[i] + style = 'k' + else: + style = '' # empty string ignored + ax = self.ax + if self.style: + style = self.style + gkde = stats.gaussian_kde(y) + sample_range = max(y) - min(y) + ind = np.linspace(min(y) - 0.5 * sample_range, + max(y) + 0.5 * sample_range, 1000) + ax.set_ylabel("Density") + plotf(ax, ind, gkde.evaluate(ind), style, label=label, **self.kwds) + ax.grid(self.grid) + + def _post_plot_logic(self): + df = self.data + + if self.subplots and self.legend: + self.axes[0].legend(loc='best') + class LinePlot(MPLPlot): def __init__(self, data, **kwargs): @@ -670,6 +714,8 @@ def plot_series(series, label=None, kind='line', use_index=True, rot=None, klass = LinePlot elif kind in ('bar', 'barh'): klass = BarPlot + elif kind == 'kde': + klass = KdePlot if ax is None: ax = _gca() @@ -832,13 +878,16 @@ def hist_frame(data, grid=True, xlabelsize=None, xrot=None, """ import matplotlib.pyplot as plt n = len(data.columns) - k = 1 - while k ** 2 < n: - k += 1 - _, axes = _subplots(nrows=k, ncols=k, ax=ax, squeeze=False) + rows, cols = 1, 1 + while rows * cols < n: + if cols > rows: + rows += 1 + else: + cols += 1 + _, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False) for i, col in enumerate(com._try_sort(data.columns)): - ax = axes[i / k][i % k] + ax = axes[i / cols][i % cols] ax.xaxis.set_visible(True) ax.yaxis.set_visible(True) ax.hist(data[col].dropna().values, **kwds) @@ -854,8 +903,8 @@ def hist_frame(data, grid=True, xlabelsize=None, xrot=None, if yrot is not None: plt.setp(ax.get_yticklabels(), rotation=yrot) - for j in range(i + 1, k**2): - ax = axes[j / k, j % k] + for j in range(i + 1, rows * cols): + ax = axes[j / cols, j % cols] ax.set_visible(False) ax.get_figure().subplots_adjust(wspace=0.3, hspace=0.3) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 4482e05295cd5..8253ad4e1e1db 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -4,12 +4,13 @@ import unittest from numpy.random import randn +from numpy import nan import numpy as np import random from pandas import * from pandas.tseries.index import DatetimeIndex -from pandas.tools.merge import merge, concat +from pandas.tools.merge import merge, concat, 
ordered_merge from pandas.util.testing import (assert_frame_equal, assert_series_equal, assert_almost_equal, rands) import pandas._tseries as lib @@ -463,8 +464,8 @@ def test_merge_overlap(self): merged = merge(self.left, self.left, on='key') exp_len = (self.left['key'].value_counts() ** 2).sum() self.assertEqual(len(merged), exp_len) - self.assert_('v1.x' in merged) - self.assert_('v1.y' in merged) + self.assert_('v1_x' in merged) + self.assert_('v1_y' in merged) def test_merge_different_column_key_names(self): left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], @@ -479,8 +480,8 @@ def test_merge_different_column_key_names(self): ['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan]) assert_almost_equal(merged['rkey'], ['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux']) - assert_almost_equal(merged['value.x'], [2, 3, 1, 1, 4, 4, np.nan]) - assert_almost_equal(merged['value.y'], [6, np.nan, 5, 8, 5, 8, 7]) + assert_almost_equal(merged['value_x'], [2, 3, 1, 1, 4, 4, np.nan]) + assert_almost_equal(merged['value_y'], [6, np.nan, 5, 8, 5, 8, 7]) def test_merge_nocopy(self): left = DataFrame({'a' : 0, 'b' : 1}, index=range(10)) @@ -656,7 +657,7 @@ def test_left_merge_na_buglet(self): tm.assert_frame_equal(merged, expected) def _check_join(left, right, result, join_col, how='left', - lsuffix='.x', rsuffix='.y'): + lsuffix='_x', rsuffix='_y'): # some smoke tests for c in join_col: @@ -1197,7 +1198,7 @@ def test_concat_series(self): result = concat(pieces, keys=[0, 1, 2]) expected = ts.copy() - ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[us]')) + ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))] @@ -1248,6 +1249,56 @@ def test_mixed_type_join_with_suffix(self): # it works! mn.join(cn, rsuffix='_right') + +class TestOrderedMerge(unittest.TestCase): + + def setUp(self): + self.left = DataFrame({'key': ['a', 'c', 'e'], + 'lvalue': [1, 2., 3]}) + + self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], + 'rvalue': [1, 2, 3., 4]}) + + # GH #813 + + def test_basic(self): + result = ordered_merge(self.left, self.right, on='key') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1, nan, 2, nan, 3, nan], + 'rvalue': [nan, 1, 2, 3, nan, 4]}) + + assert_frame_equal(result, expected) + + def test_ffill(self): + result = ordered_merge(self.left, self.right, on='key', fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1., 1, 2, 2, 3, 3.], + 'rvalue': [nan, 1, 2, 3, 3, 4]}) + assert_frame_equal(result, expected) + + def test_multigroup(self): + left = concat([self.left, self.left], ignore_index=True) + # right = concat([self.right, self.right], ignore_index=True) + + left['group'] = ['a'] * 3 + ['b'] * 3 + # right['group'] = ['a'] * 4 + ['b'] * 4 + + result = ordered_merge(left, self.right, on='key', left_by='group', + fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, + 'lvalue': [1., 1, 2, 2, 3, 3.] 
* 2, + 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) + expected['group'] = ['a'] * 6 + ['b'] * 6 + + assert_frame_equal(result, expected.ix[:, result.columns]) + + result2 = ordered_merge(self.right, left, on='key', right_by='group', + fill_method='ffill') + assert_frame_equal(result, result2.ix[:, result.columns]) + + result = ordered_merge(left, self.right, on='key', left_by='group') + self.assert_(result['group'].notnull().all()) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index 1fb2be9a598d5..5a22fd7adde74 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -8,3 +8,4 @@ from pandas.tseries.offsets import * from pandas.tseries.period import PeriodIndex, period_range, pnow from pandas.tseries.resample import TimeGrouper +import pandas.tseries.offsets as offsets diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index e555700863dc9..6eb6e94872fee 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -68,7 +68,7 @@ def get_freq_code(freqstr): return code, stride -def _get_freq_str(base, mult): +def _get_freq_str(base, mult=1): code = _reverse_period_code_map.get(base) if code is None: return _unknown_freq @@ -696,6 +696,12 @@ def infer_freq(index, warn=True): inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq() +_ONE_MICRO = 1000L +_ONE_MILLI = _ONE_MICRO * 1000 +_ONE_SECOND = _ONE_MILLI * 1000 +_ONE_MINUTE = 60 * _ONE_SECOND +_ONE_HOUR = 60 * _ONE_MINUTE +_ONE_DAY = 24 * _ONE_HOUR class _FrequencyInferer(object): """ @@ -727,31 +733,34 @@ def is_monotonic(self): def get_freq(self): delta = self.deltas[0] - if _is_multiple(delta, _day_us): + if _is_multiple(delta, _ONE_DAY): return self._infer_daily_rule() else: # Possibly intraday frequency if not self.is_unique: return None - if _is_multiple(delta, 60 * 60 * 1000000): + if _is_multiple(delta, _ONE_HOUR): # Hours - return _maybe_add_count('H', delta / (60 * 60 * 1000000)) - elif _is_multiple(delta, 60 * 1000000): + return _maybe_add_count('H', delta / _ONE_HOUR) + elif _is_multiple(delta, _ONE_MINUTE): # Minutes - return _maybe_add_count('T', delta / (60 * 1000000)) - elif _is_multiple(delta, 1000000): + return _maybe_add_count('T', delta / _ONE_MINUTE) + elif _is_multiple(delta, _ONE_SECOND): # Seconds - return _maybe_add_count('S', delta / 1000000) - elif _is_multiple(delta, 1000): + return _maybe_add_count('S', delta / _ONE_SECOND) + elif _is_multiple(delta, _ONE_MILLI): # Milliseconds - return _maybe_add_count('L', delta / 1000) - else: + return _maybe_add_count('L', delta / _ONE_MILLI) + elif _is_multiple(delta, _ONE_MICRO): # Microseconds - return _maybe_add_count('U', delta) + return _maybe_add_count('U', delta / _ONE_MICRO) + else: + # Nanoseconds + return _maybe_add_count('N', delta) @cache_readonly def day_deltas(self): - return [x / _day_us for x in self.deltas] + return [x / _ONE_DAY for x in self.deltas] @cache_readonly def fields(self): @@ -828,7 +837,7 @@ def _infer_daily_rule(self): return monthly_rule if self.is_unique: - days = self.deltas[0] / _day_us + days = self.deltas[0] / _ONE_DAY if days % 7 == 0: # Weekly alias = _weekday_rule_aliases[self.rep_stamp.weekday()] @@ -951,6 +960,8 @@ def is_superperiod(source, target): return target not in ['D', 'B', 'H', 'T', 'S'] def _get_rule_month(source, default='DEC'): + if isinstance(source, offsets.DateOffset): + source = source.rule_code source = source.upper() if '-' not in 
source: return default @@ -988,5 +999,3 @@ def _is_weekly(rule): def _is_multiple(us, mult): return us % mult == 0 - -_day_us = 24 * 60 * 60 * 1000000 diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 96dc5f1c223d7..051477fa7027b 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1,3 +1,5 @@ +# pylint: disable=E1101 + from datetime import time, datetime from datetime import timedelta @@ -14,6 +16,7 @@ from pandas._tseries import Timestamp import pandas._tseries as lib +import pandas._algos as _algos def _utc(): import pytz @@ -63,7 +66,7 @@ def wrapper(left, right): results = joinf(left, right) if with_indexers: join_index, left_indexer, right_indexer = results - join_index = join_index.view('M8') + join_index = join_index.view('M8[ns]') return join_index, left_indexer, right_indexer return results return wrapper @@ -127,7 +130,6 @@ class DatetimeIndex(Int64Index): ---------- data : array-like (1-dimensional), optional Optional datetime-like data to construct index with - dtype : NumPy dtype (default: M8[us]) copy : bool Make a copy of input ndarray freq : string or pandas offset object, optional @@ -144,13 +146,13 @@ class DatetimeIndex(Int64Index): """ _join_precedence = 10 - _inner_indexer = _join_i8_wrapper(lib.inner_join_indexer_int64) - _outer_indexer = _join_i8_wrapper(lib.outer_join_indexer_int64) - _left_indexer = _join_i8_wrapper(lib.left_join_indexer_int64, + _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64, with_indexers=False) _groupby = lib.groupby_arrays # _wrap_i8_function(lib.groupby_int64) - _arrmap = _wrap_dt_function(lib.arrmap_object) + _arrmap = _wrap_dt_function(_algos.arrmap_object) __eq__ = _dt_index_cmp('__eq__') __ne__ = _dt_index_cmp('__ne__') @@ -168,7 +170,7 @@ class DatetimeIndex(Int64Index): def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, - dtype=None, copy=False, name=None, tz=None, + copy=False, name=None, tz=None, verify_integrity=True, normalize=False, **kwds): warn = False @@ -224,7 +226,7 @@ def __new__(cls, data=None, if lib.is_string_array(data): data = _str_to_dt_array(data, offset) else: - data = np.asarray(data, dtype='M8[us]') + data = np.asarray(data, dtype='M8[ns]') if issubclass(data.dtype.type, basestring): subarr = _str_to_dt_array(data, offset) @@ -234,11 +236,11 @@ def __new__(cls, data=None, offset = data.offset verify_integrity = False else: - subarr = np.array(data, dtype='M8[us]', copy=copy) + subarr = np.array(data, dtype='M8[ns]', copy=copy) elif issubclass(data.dtype.type, np.integer): - subarr = np.array(data, dtype='M8[us]', copy=copy) + subarr = np.array(data, dtype='M8[ns]', copy=copy) else: - subarr = np.array(data, dtype='M8[us]', copy=copy) + subarr = np.array(data, dtype='M8[ns]', copy=copy) if tz is not None: tz = tools._maybe_get_tz(tz) @@ -246,7 +248,7 @@ def __new__(cls, data=None, ints = subarr.view('i8') lib.tz_localize_check(ints, tz) subarr = lib.tz_convert(ints, tz, _utc()) - subarr = subarr.view('M8[us]') + subarr = subarr.view('M8[ns]') subarr = subarr.view(cls) subarr.name = name @@ -311,7 +313,7 @@ def _generate(cls, start, end, periods, name, offset, ints = index.view('i8') lib.tz_localize_check(ints, tz) index = lib.tz_convert(ints, tz, _utc()) - index = index.view('M8[us]') + index = index.view('M8[ns]') index = index.view(cls) index.name = name @@ -321,11 +323,11 @@ def _generate(cls, start, 
end, periods, name, offset, return index @classmethod - def _simple_new(cls, values, name, offset, tz): + def _simple_new(cls, values, name, freq=None, tz=None): result = values.view(cls) result.name = name - result.offset = offset - result.tz = tz + result.offset = freq + result.tz = tools._maybe_get_tz(tz) return result @@ -353,7 +355,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None, end=_CACHE_END) arr = np.array(_to_m8_array(list(xdr)), - dtype='M8[us]', copy=False) + dtype='M8[ns]', copy=False) cachedRange = arr.view(DatetimeIndex) cachedRange.offset = offset @@ -447,7 +449,7 @@ def __setstate__(self, state): # extract the raw datetime data, turn into datetime64 index_state = state[0] raw_data = index_state[0][4] - raw_data = np.array(raw_data, dtype='M8[us]') + raw_data = np.array(raw_data, dtype='M8[ns]') new_state = raw_data.__reduce__() np.ndarray.__setstate__(self, new_state[2]) else: # pragma: no cover @@ -475,8 +477,8 @@ def __sub__(self, other): def _add_delta(self, delta): if isinstance(delta, (Tick, timedelta)): - inc = offsets._delta_to_microseconds(delta) - new_values = (self.asi8 + inc).view('M8[us]') + inc = offsets._delta_to_nanoseconds(delta) + new_values = (self.asi8 + inc).view('M8[ns]') else: new_values = self.astype('O') + delta return DatetimeIndex(new_values, tz=self.tz, freq='infer') @@ -495,6 +497,13 @@ def summary(self, name=None): return result + def astype(self, dtype): + dtype = np.dtype(dtype) + + if dtype == np.object_: + return self.asobject + return Index.astype(self, dtype) + @property def asi8(self): # do not cache or you'll create a memory leak @@ -544,7 +553,6 @@ def order(self, return_indexer=False, ascending=True): return self._simple_new(sorted_values, self.name, None, self.tz) - def snap(self, freq='S'): """ Snap time stamps to nearest occuring frequency @@ -553,7 +561,7 @@ def snap(self, freq='S'): # Superdumb, punting on any optimizing freq = to_offset(freq) - snapped = np.empty(len(self), dtype='M8[us]') + snapped = np.empty(len(self), dtype='M8[ns]') for i, v in enumerate(self): s = v @@ -564,7 +572,7 @@ def snap(self, freq='S'): s = t0 else: s = t1 - snapped[i] = np.datetime64(s) + snapped[i] = s # we know it conforms; skip check return DatetimeIndex(snapped, freq=freq, verify_integrity=False) @@ -632,6 +640,12 @@ def union(self, other): ------- y : Index or DatetimeIndex """ + if not isinstance(other, DatetimeIndex): + try: + other = DatetimeIndex(other) + except TypeError: + pass + this, other = self._maybe_utc_convert(other) if this._can_fast_union(other): @@ -878,8 +892,8 @@ def _indices_at_time(self, key): # TODO: time object with tzinfo? 
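# --- editor's aside (illustrative sketch, not part of this patch) ----------
# With the index now stored as datetime64[ns], shifting stamps is plain
# int64 arithmetic on nanoseconds since the epoch, as _add_delta above does;
# the values below are made up for illustration:
import numpy as np

stamps = (np.arange(3, dtype=np.int64) * 86400 * 1000000000).view('M8[ns]')
one_hour = 3600 * 1000000000                          # one hour in nanoseconds
shifted = (stamps.view('i8') + one_hour).view('M8[ns]')
# --- end aside --------------------------------------------------------------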
- mus = _time_to_microsecond(key) - indexer = lib.values_at_time(self.asi8, mus) + nanos = _time_to_nanosecond(key) + indexer = lib.values_at_time(self.asi8, nanos) return com._ensure_platform_int(indexer) def _get_string_slice(self, key): @@ -989,7 +1003,7 @@ def __iter__(self): def searchsorted(self, key, side='left'): if isinstance(key, np.ndarray): - key = np.array(key, dtype='M8[us]', copy=False) + key = np.array(key, dtype='M8[ns]', copy=False) else: key = _to_m8(key) @@ -1014,7 +1028,7 @@ def _constructor(self): @property def dtype(self): - return np.dtype('M8') + return np.dtype('M8[ns]') @property def is_all_dates(self): @@ -1106,7 +1120,7 @@ def tz_localize(self, tz): # Convert to UTC new_dates = lib.tz_convert(self.asi8, tz, _utc()) - new_dates = new_dates.view('M8[us]') + new_dates = new_dates.view('M8[ns]') return self._simple_new(new_dates, self.name, self.offset, tz) def tz_validate(self): @@ -1137,7 +1151,7 @@ def _generate_regular_range(start, end, periods, offset): raise ValueError('Must specify two of start, end, or periods') if isinstance(offset, Tick): - stride = offset.us_stride() + stride = offset.nanos if periods is None: b = Timestamp(start).value e = Timestamp(end).value @@ -1152,12 +1166,12 @@ def _generate_regular_range(start, end, periods, offset): raise NotImplementedError data = np.arange(b, e, stride, dtype=np.int64) - data = data.view('M8[us]') + data = data.view('M8[ns]') else: xdr = generate_range(start=start, end=end, periods=periods, offset=offset) - data = np.array(list(xdr), dtype='M8[us]') + data = np.array(list(xdr), dtype='M8[ns]') return data @@ -1215,8 +1229,7 @@ def _dt_box_array(arr, offset=None, tz=None): return arr boxfunc = lambda x: Timestamp(x, offset=offset, tz=tz) - boxer = np.frompyfunc(boxfunc, 1, 1) - return boxer(arr) + return lib.map_infer(arr, boxfunc) def _to_m8(key): @@ -1227,7 +1240,10 @@ def _to_m8(key): # this also converts strings key = Timestamp(key) - return np.datetime64(lib.pydt_to_i8(key)) + if np.__version__[:3] == '1.6': + return np.datetime64(lib.pydt_to_i8(key)) + else: + return np.datetime64(lib.pydt_to_i8(key), 'us') def _to_m8_array(arr): @@ -1243,7 +1259,7 @@ def parser(x): p_ufunc = np.frompyfunc(parser, 1, 1) data = p_ufunc(arr) - return np.array(data, dtype='M8[us]') + return np.array(data, dtype='M8[ns]') _CACHE_START = Timestamp(datetime(1950, 1, 1)) @@ -1261,6 +1277,6 @@ def _naive_in_cache_range(start, end): def _in_range(start, end, rng_start, rng_end): return start > rng_start and end < rng_end -def _time_to_microsecond(time): +def _time_to_nanosecond(time): seconds = time.hour * 60 * 60 + 60 * time.minute + time.second - return 1000000 * seconds + time.microsecond + return (1000000 * seconds + time.microsecond) * 1000 diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index c1d915a04453c..fe268003c1109 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -4,6 +4,7 @@ from pandas.core.common import _count_not_none from pandas.tseries.tools import to_datetime +from pandas.util.decorators import cache_readonly # import after tools, dateutil check from dateutil.relativedelta import relativedelta @@ -408,7 +409,7 @@ def __init__(self, n=1, **kwds): raise Exception('Day must be 0<=day<=6, got %d' % self.weekday) - self.delta = timedelta(weeks=1) + self._inc = timedelta(weeks=1) self.kwds = kwds def isAnchored(self): @@ -416,7 +417,7 @@ def isAnchored(self): def apply(self, other): if self.weekday is None: - return other + self.n * self.delta + return other + self.n * 
self._inc if self.n > 0: k = self.n @@ -425,14 +426,14 @@ def apply(self, other): other = other + timedelta((self.weekday - otherDay) % 7) k = k - 1 for i in xrange(k): - other = other + self.delta + other = other + self._inc else: k = self.n otherDay = other.weekday() if otherDay != self.weekday: other = other + timedelta((self.weekday - otherDay) % 7) for i in xrange(-k): - other = other - self.delta + other = other - self._inc return other def onOffset(self, dt): @@ -919,7 +920,6 @@ def rule_code(self): # Ticks class Tick(DateOffset): - _delta = None _inc = timedelta(microseconds=1000) def __add__(self, other): @@ -955,15 +955,13 @@ def __ne__(self, other): else: return DateOffset.__ne__(self, other) - @property + @cache_readonly def delta(self): - if self._delta is None: - self._delta = self.n * self._inc - - return self._delta + return self.n * self._inc - def us_stride(self): - return _delta_to_microseconds(self.delta) + @property + def nanos(self): + return _delta_to_nanoseconds(self.delta) def apply(self, other): if isinstance(other, (datetime, timedelta)): @@ -989,23 +987,29 @@ def _delta_to_tick(delta): else: return Second(seconds) else: - mus = _delta_to_microseconds(delta) - if mus % 1000 == 0: - return Milli(mus // 1000) + nanos = _delta_to_nanoseconds(delta) + if nanos % 1000000 == 0: + return Milli(nanos // 1000000) + elif nanos % 1000 == 0: + return Micro(nanos // 1000) else: - return Micro(mus) + return Nano(nanos) -def _delta_to_microseconds(delta): +def _delta_to_nanoseconds(delta): if isinstance(delta, Tick): delta = delta.delta return (delta.days * 24 * 60 * 60 * 1000000 + delta.seconds * 1000000 - + delta.microseconds) + + delta.microseconds) * 1000 class Day(Tick, CacheableOffset): _inc = timedelta(1) _rule_base = 'D' + def isAnchored(self): + + return False + class Hour(Tick): _inc = timedelta(0, 3600) _rule_base = 'H' @@ -1025,6 +1029,10 @@ class Micro(Tick): _inc = timedelta(microseconds=1) _rule_base = 'U' +class Nano(Tick): + _inc = 1 + _rule_base = 'N' + BDay = BusinessDay BMonthEnd = BusinessMonthEnd BMonthBegin = BusinessMonthBegin diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index f443ab6d99924..5cae2375cf54a 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -10,7 +10,6 @@ import pandas.tseries.frequencies as _freq_mod import pandas.core.common as com -from pandas.util import py3compat from pandas._tseries import Timestamp import pandas._tseries as lib @@ -25,8 +24,7 @@ def _period_field_accessor(name, alias=None): alias = name def f(self): base, mult = _gfc(self.freq) - g = getattr(lib, 'get_period_%s' % alias) - return g(self.ordinal, base, mult) + return lib.get_period_field(alias, self.ordinal, base) f.__name__ = name return property(f) @@ -35,8 +33,7 @@ def _field_accessor(name, alias=None): alias = name def f(self): base, mult = _gfc(self.freq) - g = getattr(lib, 'get_period_%s_arr' % alias) - return g(self.values, base, mult) + return lib.get_period_field_arr(alias, self.values, base) f.__name__ = name return property(f) @@ -99,8 +96,6 @@ def __init__(self, value=None, freq=None, ordinal=None, elif ordinal is not None: if not com.is_integer(ordinal): raise ValueError("Ordinal must be an integer") - if ordinal <= 0: - raise ValueError("Ordinal must be positive") if freq is None: raise ValueError('Must supply freq for ordinal value') self.ordinal = ordinal @@ -113,6 +108,8 @@ def __init__(self, value=None, freq=None, ordinal=None, raise ValueError("If value is None, year cannot be None") base, mult = _gfc(freq) 
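# --- editor's aside (illustrative sketch, not part of this patch) ----------
# The offsets.py change above replaces microsecond math with nanosecond
# math; this standalone helper mirrors _delta_to_nanoseconds for a plain
# timedelta:
from datetime import timedelta

def delta_to_nanoseconds(delta):
    # whole-microsecond timedelta expressed as integer nanoseconds
    return (delta.days * 24 * 60 * 60 * 1000000
            + delta.seconds * 1000000
            + delta.microseconds) * 1000

delta_to_nanoseconds(timedelta(minutes=5))   # 300000000000
# --- end aside --------------------------------------------------------------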
+ if mult != 1: + raise ValueError('Only mult == 1 supported') if quarter is not None: mnum = _month_numbers[_freq_mod._get_rule_month(freq)] + 1 @@ -121,7 +118,7 @@ def __init__(self, value=None, freq=None, ordinal=None, year -= 1 self.ordinal = lib.period_ordinal(year, month, day, hour, minute, - second, base, mult) + second, base) elif isinstance(value, Period): other = value @@ -168,12 +165,15 @@ def __init__(self, value=None, freq=None, ordinal=None, raise ValueError(msg) base, mult = _gfc(freq) + if mult != 1: + raise ValueError('Only mult == 1 supported') if self.ordinal is None: - self.ordinal = lib.period_ordinal(dt.year, dt.month, dt.day, dt.hour, - dt.minute, dt.second, base, mult) + self.ordinal = lib.period_ordinal(dt.year, dt.month, dt.day, + dt.hour, dt.minute, dt.second, + base) - self.freq = _freq_mod._get_freq_str(base, mult) + self.freq = _freq_mod._get_freq_str(base) def __eq__(self, other): if isinstance(other, Period): @@ -215,14 +215,16 @@ def asfreq(self, freq=None, how='E'): base1, mult1 = _gfc(self.freq) base2, mult2 = _gfc(freq) + if mult2 != 1: + raise ValueError('Only mult == 1 supported') + if how not in ('S', 'E'): raise ValueError('relation argument must be one of S or E') end = how == 'E' - new_ordinal = lib.period_asfreq(self.ordinal, base1, mult1, - base2, mult2, end) + new_ordinal = lib.period_asfreq(self.ordinal, base1, base2, end) - return Period(ordinal=new_ordinal, freq=(base2, mult2)) + return Period(ordinal=new_ordinal, freq=base2) @property def start_time(self): @@ -255,23 +257,27 @@ def to_timestamp(self, freq=None, how='S'): else: base, mult = _gfc(freq) new_val = self.asfreq(freq, how) - dt64 = lib.period_ordinal_to_dt64(new_val.ordinal, base, mult) + + if mult != 1: + raise ValueError('Only mult == 1 supported') + + dt64 = lib.period_ordinal_to_dt64(new_val.ordinal, base) ts_freq = _period_rule_to_timestamp_rule(new_val.freq, how=how) return Timestamp(dt64, offset=to_offset(ts_freq)) - year = _period_field_accessor('year') - month = _period_field_accessor('month') - day = _period_field_accessor('day') - hour = _period_field_accessor('hour') - minute = _period_field_accessor('minute') - second = _period_field_accessor('second') - weekofyear = _period_field_accessor('week') + year = _period_field_accessor('year', 0) + month = _period_field_accessor('month', 3) + day = _period_field_accessor('day', 4) + hour = _period_field_accessor('hour', 5) + minute = _period_field_accessor('minute', 6) + second = _period_field_accessor('second', 7) + weekofyear = _period_field_accessor('week', 8) week = weekofyear - dayofweek = _period_field_accessor('dayofweek', 'dow') + dayofweek = _period_field_accessor('dayofweek', 10) weekday = dayofweek - dayofyear = day_of_year = _period_field_accessor('dayofyear', 'doy') - quarter = _period_field_accessor('quarter') - qyear = _period_field_accessor('qyear') + dayofyear = day_of_year = _period_field_accessor('dayofyear', 9) + quarter = _period_field_accessor('quarter', 2) + qyear = _period_field_accessor('qyear', 1) @classmethod def now(cls, freq=None): @@ -279,15 +285,13 @@ def now(cls, freq=None): def __repr__(self): base, mult = _gfc(self.freq) - formatted = lib.period_ordinal_to_string(self.ordinal, base, mult) + formatted = lib.period_ordinal_to_string(self.ordinal, base) freqstr = _freq_mod._reverse_period_code_map[base] - if mult == 1: - return "Period('%s', '%s')" % (formatted, freqstr) - return ("Period('%s', '%d%s')" % (formatted, mult, freqstr)) + return "Period('%s', '%s')" % (formatted, freqstr) def 
__str__(self): base, mult = _gfc(self.freq) - formatted = lib.period_ordinal_to_string(self.ordinal, base, mult) + formatted = lib.period_ordinal_to_string(self.ordinal, base) return ("%s" % formatted) def strftime(self, fmt): @@ -429,9 +433,9 @@ def strftime(self, fmt): """ base, mult = _gfc(self.freq) if fmt is not None: - return lib.period_strftime(self.ordinal, base, mult, fmt) + return lib.period_strftime(self.ordinal, base, fmt) else: - return lib.period_ordinal_to_string(self.ordinal, base, mult) + return lib.period_ordinal_to_string(self.ordinal, base) def _period_unbox(key, check=None): ''' @@ -462,6 +466,9 @@ def _period_box_array(arr, freq): return boxer(arr) def dt64arr_to_periodarr(data, freq): + if data.dtype != np.dtype('M8[ns]'): + raise ValueError('Wrong dtype: %s' % data.dtype) + if data is None: return data @@ -470,10 +477,34 @@ def dt64arr_to_periodarr(data, freq): else: base, mult = freq - return lib.dt64arr_to_periodarr(data.view('i8'), base, mult) + return lib.dt64arr_to_periodarr(data.view('i8'), base) # --- Period index sketch + +def _period_index_cmp(opname): + """ + Wrap comparison operations to convert datetime-like to datetime64 + """ + def wrapper(self, other): + if isinstance(other, Period): + func = getattr(self.values, opname) + assert(other.freq == self.freq) + result = func(other.ordinal) + elif isinstance(other, PeriodIndex): + assert(other.freq == self.freq) + return getattr(self.values, opname)(other.values) + else: + other = Period(other, freq=self.freq) + func = getattr(self.values, opname) + result = func(other.ordinal) + try: + return result.view(np.ndarray) + except: + return result + return wrapper + + class PeriodIndex(Int64Index): """ Immutable ndarray holding ordinal values indicating regular periods in @@ -511,6 +542,13 @@ class PeriodIndex(Int64Index): """ _box_scalars = True + __eq__ = _period_index_cmp('__eq__') + __ne__ = _period_index_cmp('__ne__') + __lt__ = _period_index_cmp('__lt__') + __gt__ = _period_index_cmp('__gt__') + __le__ = _period_index_cmp('__le__') + __ge__ = _period_index_cmp('__ge__') + def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, copy=False, name=None): @@ -563,8 +601,7 @@ def __new__(cls, data=None, else: base1, mult1 = _gfc(data.freq) base2, mult2 = _gfc(freq) - data = lib.period_asfreq_arr(data.values, base1, mult1, - base2, mult2, 1) + data = lib.period_asfreq_arr(data.values, base1, base2, 1) else: if freq is None and len(data) > 0: freq = getattr(data[0], 'freq') @@ -573,7 +610,7 @@ def __new__(cls, data=None, raise ValueError(('freq not specified and cannot be ' 'inferred from first element')) - if data.dtype == np.datetime64: + if issubclass(data.dtype.type, np.datetime_): data = dt64arr_to_periodarr(data, freq) elif data.dtype == np.int64: pass @@ -622,6 +659,19 @@ def __iter__(self): def is_all_dates(self): return True + @property + def is_full(self): + """ + Returns True if there are any missing periods from start to end + """ + if len(self) == 0: + return True + if not self.is_monotonic: + raise ValueError('Index is not monotonic') + values = self.values + return ((values[1:] - values[:-1]) < 2).all() + + @property def freqstr(self): return self.freq @@ -638,31 +688,33 @@ def asfreq(self, freq=None, how='E'): else: base2, mult2 = freq + if mult2 != 1: + raise ValueError('Only mult == 1 supported') + if how not in ('S', 'E'): raise ValueError('relation argument must be one of S or E') end = how == 'E' - new_data = lib.period_asfreq_arr(self.values, base1, mult1, - base2, mult2, 
end) + new_data = lib.period_asfreq_arr(self.values, base1, base2, end) result = new_data.view(PeriodIndex) result.name = self.name result.freq = freq return result - year = _field_accessor('year') - month = _field_accessor('month') - day = _field_accessor('day') - hour = _field_accessor('hour') - minute = _field_accessor('minute') - second = _field_accessor('second') - weekofyear = _field_accessor('week') + year = _field_accessor('year', 0) + month = _field_accessor('month', 3) + day = _field_accessor('day', 4) + hour = _field_accessor('hour', 5) + minute = _field_accessor('minute', 6) + second = _field_accessor('second', 7) + weekofyear = _field_accessor('week', 8) week = weekofyear - dayofweek = _field_accessor('dayofweek', 'dow') + dayofweek = _field_accessor('dayofweek', 10) weekday = dayofweek - dayofyear = day_of_year = _field_accessor('dayofyear', 'doy') - quarter = _field_accessor('quarter') - qyear = _field_accessor('qyear') + dayofyear = day_of_year = _field_accessor('dayofyear', 9) + quarter = _field_accessor('quarter', 2) + qyear = _field_accessor('qyear', 1) # Try to run function on index first, and then on elements of index # Especially important for group-by functionality @@ -693,12 +745,14 @@ def to_timestamp(self, freq=None, how='start'): if freq is None: base, mult = _gfc(self.freq) new_data = self - # freq = self.freq else: base, mult = _gfc(freq) new_data = self.asfreq(freq, how) - # freq = 'infer' - new_data = lib.periodarr_to_dt64arr(new_data.values, base, mult) + + if mult != 1: + raise ValueError('Only mult == 1 supported') + + new_data = lib.periodarr_to_dt64arr(new_data.values, base) return DatetimeIndex(new_data, freq='infer') def shift(self, n): @@ -748,7 +802,7 @@ def get_value(self, series, key): """ try: return super(PeriodIndex, self).get_value(series, key) - except KeyError: + except (KeyError, IndexError): try: asdt, parsed, reso = parse_time_string(key, self.freq) grp = _freq_mod._infer_period_group(reso) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 4cd548dc9120a..f1109dd52f395 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -37,7 +37,7 @@ class TimeGrouper(CustomGrouper): def __init__(self, freq='Min', closed='right', label='right', how='mean', begin=None, end=None, nperiods=None, axis=0, fill_method=None, limit=None, loffset=None, kind=None, - convention=None): + convention=None, base=0): self.freq = freq self.closed = closed self.label = label @@ -51,12 +51,20 @@ def __init__(self, freq='Min', closed='right', label='right', how='mean', self.how = how self.fill_method = fill_method self.limit = limit + self.base = base def resample(self, obj): axis = obj._get_axis(self.axis) if isinstance(axis, DatetimeIndex): return self._resample_timestamps(obj) elif isinstance(axis, PeriodIndex): + offset = to_offset(self.freq) + if offset.n > 1: + if self.kind == 'period': # pragma: no cover + print 'Warning: multiple of frequency -> timestamps' + # Cannot have multiple of periods, convert to timestamp + self.kind = 'timestamp' + if self.kind is None or self.kind == 'period': return self._resample_periods(obj) else: @@ -81,9 +89,33 @@ def _get_time_grouper(self, obj): return binner, grouper def _get_time_bins(self, axis): - return _make_time_bins(axis, self.freq, begin=self.begin, - end=self.end, closed=self.closed, - label=self.label) + assert(isinstance(axis, DatetimeIndex)) + + if len(axis) == 0: + # TODO: Should we be a bit more careful here? 
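# --- editor's aside (illustrative sketch, not part of this patch) ----------
# Hedged usage of the new `base` argument threaded through this grouper
# (mirrors test_resample_base later in the patch): with base=2, the 5-minute
# bins start at :02, :07, :12, ...
import numpy as np
from pandas import Series, date_range

rng = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s')
ts = Series(np.random.randn(len(rng)), index=rng)
resampled = ts.resample('5min', base=2)      # first bin edge at 00:02:00
# --- end aside --------------------------------------------------------------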
+ return [], [], [] + + first, last = _get_range_edges(axis, self.begin, self.end, self.freq, + closed=self.closed, base=self.base) + binner = DatetimeIndex(freq=self.freq, start=first, end=last) + + # a little hack + trimmed = False + if len(binner) > 2 and binner[-2] == axis[-1]: + binner = binner[:-1] + trimmed = True + + # general version, knowing nothing about relative frequencies + bins = lib.generate_bins_dt64(axis.asi8, binner.asi8, self.closed) + + if self.label == 'right': + labels = binner[1:] + elif not trimmed: + labels = binner[:-1] + else: + labels = binner + + return binner, bins, labels def _get_time_period_bins(self, axis): return _make_period_bins(axis, self.freq, begin=self.begin, @@ -203,42 +235,23 @@ def _make_period_bins(axis, freq, begin=None, end=None, return binner, bins, labels -def _make_time_bins(axis, freq, begin=None, end=None, - closed='right', label='right'): - assert(isinstance(axis, DatetimeIndex)) - - if len(axis) == 0: - # TODO: Should we be a bit more careful here? - return [], [], [] - - first, last = _get_range_edges(axis, begin, end, freq, closed=closed) - binner = DatetimeIndex(freq=freq, start=first, end=last) - - # a little hack - trimmed = False - if len(binner) > 2 and binner[-2] == axis[-1]: - binner = binner[:-1] - trimmed = True - - # general version, knowing nothing about relative frequencies - bins = lib.generate_bins_dt64(axis.asi8, binner.asi8, closed) - - if label == 'right': - labels = binner[1:] - elif not trimmed: - labels = binner[:-1] - else: - labels = binner - - return binner, bins, labels - -def _get_range_edges(axis, begin, end, offset, closed='left'): +def _get_range_edges(axis, begin, end, offset, closed='left', + base=0): + from pandas.tseries.offsets import Tick, _delta_to_nanoseconds if isinstance(offset, basestring): offset = to_offset(offset) if not isinstance(offset, DateOffset): raise ValueError("Rule not a recognized offset") + if isinstance(offset, Tick): + day_nanos = _delta_to_nanoseconds(timedelta(1)) + # #1165 + if ((day_nanos % offset.nanos) == 0 and begin is None + and end is None): + return _adjust_dates_anchored(axis[0], axis[-1], offset, + closed=closed, base=base) + if begin is None: if closed == 'left': first = Timestamp(offset.rollback(axis[0])) @@ -249,12 +262,54 @@ def _get_range_edges(axis, begin, end, offset, closed='left'): if end is None: last = Timestamp(axis[-1] + offset) - # last = Timestamp(offset.rollforward(axis[-1])) else: last = Timestamp(offset.rollforward(end)) return first, last + +def _adjust_dates_anchored(first, last, offset, closed='right', base=0): + from pandas.tseries.tools import normalize_date + + start_day_nanos = Timestamp(normalize_date(first)).value + last_day_nanos = Timestamp(normalize_date(last)).value + + base_nanos = (base % offset.n) * offset.nanos // offset.n + start_day_nanos += base_nanos + last_day_nanos += base_nanos + + foffset = (first.value - start_day_nanos) % offset.nanos + loffset = (last.value - last_day_nanos) % offset.nanos + + if closed == 'right': + if foffset > 0: + # roll back + fresult = first.value - foffset + else: + fresult = first.value - offset.nanos + + if loffset > 0: + # roll forward + lresult = last.value + (offset.nanos - loffset) + else: + # already the end of the road + lresult = last.value + else: # closed == 'left' + if foffset > 0: + fresult = first.value - foffset + else: + # start of the road + fresult = first.value + + if loffset > 0: + # roll forward + lresult = last.value + (offset.nanos - loffset) + else: + lresult = last.value + 
offset.nanos + + return Timestamp(fresult), Timestamp(lresult) + + def asfreq(obj, freq, method=None, how=None): """ Utility frequency conversion method for Series/DataFrame @@ -306,11 +361,11 @@ def values_at_time(obj, time, tz=None, asof=False): # TODO: time object with tzinfo? - mus = _time_to_microsecond(time) + mus = _time_to_nanosecond(time) indexer = lib.values_at_time(obj.index.asi8, mus) indexer = com._ensure_platform_int(indexer) return obj.take(indexer) -def _time_to_microsecond(time): +def _time_to_nanosecond(time): seconds = time.hour * 60 * 60 + 60 * time.minute + time.second - return 1000000 * seconds + time.microsecond + return 1000000000L * seconds + time.microsecond * 1000 diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 771d6387c127a..1e897eb73c284 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -8,6 +8,7 @@ from unittest import TestCase from datetime import datetime, timedelta +import unittest from numpy.ma.testutils import assert_equal @@ -35,6 +36,7 @@ def test_period_cons_quarterly(self): for month in MONTHS: freq = 'Q-%s' % month exp = Period('1989Q3', freq=freq) + self.assert_('1989Q3' in str(exp)) stamp = exp.to_timestamp('D', how='end') p = Period(stamp, freq=freq) self.assertEquals(p, exp) @@ -144,7 +146,7 @@ def test_period_constructor(self): self.assertEqual(i1, expected) i1 = Period(ordinal=200701, freq='M') - self.assertEqual(i1.year, 16726) + self.assertEqual(i1.year, 18695) self.assertRaises(ValueError, Period, ordinal=200701) @@ -152,9 +154,6 @@ def test_freq_str(self): i1 = Period('1982', freq='Min') self.assert_(i1.freq[0] != '1') - i2 = Period('11/30/2005', freq='2Q') - self.assertEquals(i2.freq[0], '2') - def test_to_timestamp(self): p = Period('1982', freq='A') start_ts = p.to_timestamp(how='S') @@ -1060,29 +1059,29 @@ def test_index_duplicate_periods(self): assert_series_equal(result, expected) def test_constructor(self): - ii = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - assert_equal(len(ii), 9) + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 9) - ii = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') - assert_equal(len(ii), 4 * 9) + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 4 * 9) - ii = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - assert_equal(len(ii), 12 * 9) + pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 12 * 9) - ii = PeriodIndex(freq='D', start='1/1/2001', end='12/31/2009') - assert_equal(len(ii), 365 * 9 + 2) + pi = PeriodIndex(freq='D', start='1/1/2001', end='12/31/2009') + assert_equal(len(pi), 365 * 9 + 2) - ii = PeriodIndex(freq='B', start='1/1/2001', end='12/31/2009') - assert_equal(len(ii), 261 * 9) + pi = PeriodIndex(freq='B', start='1/1/2001', end='12/31/2009') + assert_equal(len(pi), 261 * 9) - ii = PeriodIndex(freq='H', start='1/1/2001', end='12/31/2001 23:00') - assert_equal(len(ii), 365 * 24) + pi = PeriodIndex(freq='H', start='1/1/2001', end='12/31/2001 23:00') + assert_equal(len(pi), 365 * 24) - ii = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 23:59') - assert_equal(len(ii), 24 * 60) + pi = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 23:59') + assert_equal(len(pi), 24 * 60) - ii = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') - assert_equal(len(ii), 24 * 60 * 60) + pi = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') + 
assert_equal(len(pi), 24 * 60 * 60) start = Period('02-Apr-2005', 'B') i1 = PeriodIndex(start=start, periods=20) @@ -1139,96 +1138,96 @@ def test_constructor(self): self.assertRaises(ValueError, PeriodIndex, vals) def test_shift(self): - ii1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - ii2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') - assert_equal(len(ii1), len(ii2)) - assert_equal(ii1.shift(1).values, ii2.values) - - ii1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - ii2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') - assert_equal(len(ii1), len(ii2)) - assert_equal(ii1.shift(-1).values, ii2.values) - - ii1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - ii2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') - assert_equal(len(ii1), len(ii2)) - assert_equal(ii1.shift(1).values, ii2.values) - - ii1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - ii2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') - assert_equal(len(ii1), len(ii2)) - assert_equal(ii1.shift(-1).values, ii2.values) - - ii1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') - ii2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') - assert_equal(len(ii1), len(ii2)) - assert_equal(ii1.shift(1).values, ii2.values) - - ii1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') - ii2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') - assert_equal(len(ii1), len(ii2)) - assert_equal(ii1.shift(-1).values, ii2.values) + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(1).values, pi2.values) + + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(-1).values, pi2.values) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(1).values, pi2.values) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(-1).values, pi2.values) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(1).values, pi2.values) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(-1).values, pi2.values) def test_asfreq(self): - ii1 = PeriodIndex(freq='A', start='1/1/2001', end='1/1/2001') - ii2 = PeriodIndex(freq='Q', start='1/1/2001', end='1/1/2001') - ii3 = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2001') - ii4 = PeriodIndex(freq='D', start='1/1/2001', end='1/1/2001') - ii5 = PeriodIndex(freq='H', start='1/1/2001', end='1/1/2001 00:00') - ii6 = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 00:00') - ii7 = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 00:00:00') - - self.assertEquals(ii1.asfreq('Q', 'S'), ii2) - self.assertEquals(ii1.asfreq('Q', 's'), ii2) - self.assertEquals(ii1.asfreq('M', 'start'), ii3) - self.assertEquals(ii1.asfreq('D', 'StarT'), ii4) - self.assertEquals(ii1.asfreq('H', 
'beGIN'), ii5) - self.assertEquals(ii1.asfreq('Min', 'S'), ii6) - self.assertEquals(ii1.asfreq('S', 'S'), ii7) - - self.assertEquals(ii2.asfreq('A', 'S'), ii1) - self.assertEquals(ii2.asfreq('M', 'S'), ii3) - self.assertEquals(ii2.asfreq('D', 'S'), ii4) - self.assertEquals(ii2.asfreq('H', 'S'), ii5) - self.assertEquals(ii2.asfreq('Min', 'S'), ii6) - self.assertEquals(ii2.asfreq('S', 'S'), ii7) - - self.assertEquals(ii3.asfreq('A', 'S'), ii1) - self.assertEquals(ii3.asfreq('Q', 'S'), ii2) - self.assertEquals(ii3.asfreq('D', 'S'), ii4) - self.assertEquals(ii3.asfreq('H', 'S'), ii5) - self.assertEquals(ii3.asfreq('Min', 'S'), ii6) - self.assertEquals(ii3.asfreq('S', 'S'), ii7) - - self.assertEquals(ii4.asfreq('A', 'S'), ii1) - self.assertEquals(ii4.asfreq('Q', 'S'), ii2) - self.assertEquals(ii4.asfreq('M', 'S'), ii3) - self.assertEquals(ii4.asfreq('H', 'S'), ii5) - self.assertEquals(ii4.asfreq('Min', 'S'), ii6) - self.assertEquals(ii4.asfreq('S', 'S'), ii7) - - self.assertEquals(ii5.asfreq('A', 'S'), ii1) - self.assertEquals(ii5.asfreq('Q', 'S'), ii2) - self.assertEquals(ii5.asfreq('M', 'S'), ii3) - self.assertEquals(ii5.asfreq('D', 'S'), ii4) - self.assertEquals(ii5.asfreq('Min', 'S'), ii6) - self.assertEquals(ii5.asfreq('S', 'S'), ii7) - - self.assertEquals(ii6.asfreq('A', 'S'), ii1) - self.assertEquals(ii6.asfreq('Q', 'S'), ii2) - self.assertEquals(ii6.asfreq('M', 'S'), ii3) - self.assertEquals(ii6.asfreq('D', 'S'), ii4) - self.assertEquals(ii6.asfreq('H', 'S'), ii5) - self.assertEquals(ii6.asfreq('S', 'S'), ii7) - - self.assertEquals(ii7.asfreq('A', 'S'), ii1) - self.assertEquals(ii7.asfreq('Q', 'S'), ii2) - self.assertEquals(ii7.asfreq('M', 'S'), ii3) - self.assertEquals(ii7.asfreq('D', 'S'), ii4) - self.assertEquals(ii7.asfreq('H', 'S'), ii5) - self.assertEquals(ii7.asfreq('Min', 'S'), ii6) - - #self.assertEquals(ii7.asfreq('A', 'E'), i_end) + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='1/1/2001') + pi2 = PeriodIndex(freq='Q', start='1/1/2001', end='1/1/2001') + pi3 = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2001') + pi4 = PeriodIndex(freq='D', start='1/1/2001', end='1/1/2001') + pi5 = PeriodIndex(freq='H', start='1/1/2001', end='1/1/2001 00:00') + pi6 = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 00:00') + pi7 = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 00:00:00') + + self.assertEquals(pi1.asfreq('Q', 'S'), pi2) + self.assertEquals(pi1.asfreq('Q', 's'), pi2) + self.assertEquals(pi1.asfreq('M', 'start'), pi3) + self.assertEquals(pi1.asfreq('D', 'StarT'), pi4) + self.assertEquals(pi1.asfreq('H', 'beGIN'), pi5) + self.assertEquals(pi1.asfreq('Min', 'S'), pi6) + self.assertEquals(pi1.asfreq('S', 'S'), pi7) + + self.assertEquals(pi2.asfreq('A', 'S'), pi1) + self.assertEquals(pi2.asfreq('M', 'S'), pi3) + self.assertEquals(pi2.asfreq('D', 'S'), pi4) + self.assertEquals(pi2.asfreq('H', 'S'), pi5) + self.assertEquals(pi2.asfreq('Min', 'S'), pi6) + self.assertEquals(pi2.asfreq('S', 'S'), pi7) + + self.assertEquals(pi3.asfreq('A', 'S'), pi1) + self.assertEquals(pi3.asfreq('Q', 'S'), pi2) + self.assertEquals(pi3.asfreq('D', 'S'), pi4) + self.assertEquals(pi3.asfreq('H', 'S'), pi5) + self.assertEquals(pi3.asfreq('Min', 'S'), pi6) + self.assertEquals(pi3.asfreq('S', 'S'), pi7) + + self.assertEquals(pi4.asfreq('A', 'S'), pi1) + self.assertEquals(pi4.asfreq('Q', 'S'), pi2) + self.assertEquals(pi4.asfreq('M', 'S'), pi3) + self.assertEquals(pi4.asfreq('H', 'S'), pi5) + self.assertEquals(pi4.asfreq('Min', 'S'), pi6) + self.assertEquals(pi4.asfreq('S', 'S'), pi7) + + 
self.assertEquals(pi5.asfreq('A', 'S'), pi1) + self.assertEquals(pi5.asfreq('Q', 'S'), pi2) + self.assertEquals(pi5.asfreq('M', 'S'), pi3) + self.assertEquals(pi5.asfreq('D', 'S'), pi4) + self.assertEquals(pi5.asfreq('Min', 'S'), pi6) + self.assertEquals(pi5.asfreq('S', 'S'), pi7) + + self.assertEquals(pi6.asfreq('A', 'S'), pi1) + self.assertEquals(pi6.asfreq('Q', 'S'), pi2) + self.assertEquals(pi6.asfreq('M', 'S'), pi3) + self.assertEquals(pi6.asfreq('D', 'S'), pi4) + self.assertEquals(pi6.asfreq('H', 'S'), pi5) + self.assertEquals(pi6.asfreq('S', 'S'), pi7) + + self.assertEquals(pi7.asfreq('A', 'S'), pi1) + self.assertEquals(pi7.asfreq('Q', 'S'), pi2) + self.assertEquals(pi7.asfreq('M', 'S'), pi3) + self.assertEquals(pi7.asfreq('D', 'S'), pi4) + self.assertEquals(pi7.asfreq('H', 'S'), pi5) + self.assertEquals(pi7.asfreq('Min', 'S'), pi6) + + #self.assertEquals(pi7.asfreq('A', 'E'), i_end) def test_ts_repr(self): index = PeriodIndex(freq='A', start='1/1/2001', end='12/31/2010') @@ -1260,18 +1259,18 @@ def test_badinput(self): def test_dti_to_period(self): dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') - ii1 = dti.to_period() - ii2 = dti.to_period(freq='D') + pi1 = dti.to_period() + pi2 = dti.to_period(freq='D') - self.assertEquals(ii1[0], Period('Jan 2005', freq='M')) - self.assertEquals(ii2[0], Period('1/31/2005', freq='D')) + self.assertEquals(pi1[0], Period('Jan 2005', freq='M')) + self.assertEquals(pi2[0], Period('1/31/2005', freq='D')) - self.assertEquals(ii1[-1], Period('Nov 2005', freq='M')) - self.assertEquals(ii2[-1], Period('11/30/2005', freq='D')) + self.assertEquals(pi1[-1], Period('Nov 2005', freq='M')) + self.assertEquals(pi2[-1], Period('11/30/2005', freq='D')) - def test_iindex_slice_index(self): - ii = PeriodIndex(start='1/1/10', end='12/31/12', freq='M') - s = Series(np.random.rand(len(ii)), index=ii) + def test_pindex_slice_index(self): + pi = PeriodIndex(start='1/1/10', end='12/31/12', freq='M') + s = Series(np.random.rand(len(pi)), index=pi) res = s['2010'] exp = s[0:12] assert_series_equal(res, exp) @@ -1279,20 +1278,20 @@ def test_iindex_slice_index(self): exp = s[12:24] assert_series_equal(res, exp) - def test_iindex_qaccess(self): - ii = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q') - s = Series(np.random.rand(len(ii)), index=ii).cumsum() + def test_pindex_qaccess(self): + pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q') + s = Series(np.random.rand(len(pi)), index=pi).cumsum() # Todo: fix these accessors! 
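# --- editor's aside (illustrative sketch, not part of this patch) ----------
# The renamed tests above exercise string-based selection on a period
# index; mirroring test_pindex_slice_index, a bare year picks out all
# twelve monthly periods:
import numpy as np
from pandas import Series, PeriodIndex

pi = PeriodIndex(start='1/1/10', end='12/31/12', freq='M')
s = Series(np.random.rand(len(pi)), index=pi)
year_2010 = s['2010']                        # equivalent to s[0:12]
# --- end aside --------------------------------------------------------------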
self.assert_(s['05Q4'] == s[2]) def test_period_dt64_round_trip(self): dti = date_range('1/1/2000', '1/7/2002', freq='B') - ii = dti.to_period() - self.assert_(ii.to_timestamp().equals(dti)) + pi = dti.to_period() + self.assert_(pi.to_timestamp().equals(dti)) dti = date_range('1/1/2000', '1/7/2002', freq='B') - ii = dti.to_period(freq='H') - self.assert_(ii.to_timestamp().equals(dti)) + pi = dti.to_period(freq='H') + self.assert_(pi.to_timestamp().equals(dti)) def test_to_period_quarterly(self): # make sure we can make the round trip @@ -1303,27 +1302,35 @@ def test_to_period_quarterly(self): result = stamps.to_period(freq) self.assert_(rng.equals(result)) - def test_iindex_multiples(self): - ii = PeriodIndex(start='1/1/10', end='12/31/12', freq='2M') - self.assertEquals(ii[0], Period('1/1/10', '2M')) - self.assertEquals(ii[1], Period('3/1/10', '2M')) + def test_no_multiples(self): + self.assertRaises(ValueError, period_range, '1989Q3', periods=10, + freq='2Q') + + self.assertRaises(ValueError, period_range, '1989', periods=10, + freq='2A') + self.assertRaises(ValueError, Period, '1989', freq='2A') + + # def test_pindex_multiples(self): + # pi = PeriodIndex(start='1/1/10', end='12/31/12', freq='2M') + # self.assertEquals(pi[0], Period('1/1/10', '2M')) + # self.assertEquals(pi[1], Period('3/1/10', '2M')) - self.assertEquals(ii[0].asfreq('6M'), ii[2].asfreq('6M')) - self.assertEquals(ii[0].asfreq('A'), ii[2].asfreq('A')) + # self.assertEquals(pi[0].asfreq('6M'), pi[2].asfreq('6M')) + # self.assertEquals(pi[0].asfreq('A'), pi[2].asfreq('A')) - self.assertEquals(ii[0].asfreq('M', how='S'), - Period('Jan 2010', '1M')) - self.assertEquals(ii[0].asfreq('M', how='E'), - Period('Feb 2010', '1M')) - self.assertEquals(ii[1].asfreq('M', how='S'), - Period('Mar 2010', '1M')) + # self.assertEquals(pi[0].asfreq('M', how='S'), + # Period('Jan 2010', '1M')) + # self.assertEquals(pi[0].asfreq('M', how='E'), + # Period('Feb 2010', '1M')) + # self.assertEquals(pi[1].asfreq('M', how='S'), + # Period('Mar 2010', '1M')) - i = Period('1/1/2010 12:05:18', '5S') - self.assertEquals(i, Period('1/1/2010 12:05:15', '5S')) + # i = Period('1/1/2010 12:05:18', '5S') + # self.assertEquals(i, Period('1/1/2010 12:05:15', '5S')) - i = Period('1/1/2010 12:05:18', '5S') - self.assertEquals(i.asfreq('1S', how='E'), - Period('1/1/2010 12:05:19', '1S')) + # i = Period('1/1/2010 12:05:18', '5S') + # self.assertEquals(i.asfreq('1S', how='E'), + # Period('1/1/2010 12:05:19', '1S')) def test_iteration(self): index = PeriodIndex(start='1/1/10', periods=4, freq='B') @@ -1418,33 +1425,33 @@ def test_fields(self): # year, month, day, hour, minute # second, weekofyear, week, dayofweek, weekday, dayofyear, quarter # qyear - ii = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - self._check_all_fields(ii) + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + self._check_all_fields(pi) - ii = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2003') - self._check_all_fields(ii) + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2003') + self._check_all_fields(pi) - ii = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2002') - self._check_all_fields(ii) + pi = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2002') + self._check_all_fields(pi) - ii = PeriodIndex(freq='D', start='12/1/2001', end='1/1/2002') - self._check_all_fields(ii) + pi = PeriodIndex(freq='D', start='12/1/2001', end='1/1/2002') + self._check_all_fields(pi) - ii = PeriodIndex(freq='B', start='12/1/2001', end='1/1/2002') - 
self._check_all_fields(ii) + pi = PeriodIndex(freq='B', start='12/1/2001', end='1/1/2002') + self._check_all_fields(pi) - ii = PeriodIndex(freq='H', start='12/31/2001', end='1/1/2002 23:00') - self._check_all_fields(ii) + pi = PeriodIndex(freq='H', start='12/31/2001', end='1/1/2002 23:00') + self._check_all_fields(pi) - ii = PeriodIndex(freq='Min', start='12/31/2001', end='1/1/2002 00:59') - self._check_all_fields(ii) + pi = PeriodIndex(freq='Min', start='12/31/2001', end='1/1/2002 00:59') + self._check_all_fields(pi) - ii = PeriodIndex(freq='S', start='12/31/2001', end='1/1/2001 00:00:01') - self._check_all_fields(ii) + pi = PeriodIndex(freq='S', start='12/31/2001', end='1/1/2001 00:00:01') + self._check_all_fields(pi) end_intv = Period('2006-12-31', 'W') i1 = PeriodIndex(end=end_intv, periods=10) - self._check_all_fields(ii) + self._check_all_fields(pi) def _check_all_fields(self, periodindex): fields = ['year', 'month', 'day', 'hour', 'minute', @@ -1458,6 +1465,23 @@ def _check_field(self, periodindex, fieldname): for x, val in zip(periodindex, field_idx): assert_equal(getattr(x, fieldname), val) + def test_is_full(self): + index = PeriodIndex([2005, 2007, 2009], freq='A') + self.assert_(not index.is_full) + + index = PeriodIndex([2005, 2006, 2007], freq='A') + self.assert_(index.is_full) + + index = PeriodIndex([2005, 2005, 2007], freq='A') + self.assert_(not index.is_full) + + index = PeriodIndex([2005, 2005, 2006], freq='A') + self.assert_(index.is_full) + + index = PeriodIndex([2006, 2005, 2005], freq='A') + self.assertRaises(ValueError, getattr, index, 'is_full') + + self.assert_(index[:0].is_full) def _permute(obj): return obj.take(np.random.permutation(len(obj))) @@ -1478,8 +1502,57 @@ def test_add(self): self.assertRaises(ValueError, dt1.__add__, dt2) -############################################################################### -#------------------------------------------------------------------------------ +class TestPeriodRepresentation(unittest.TestCase): + """ + Wish to match NumPy units + """ + + def test_annual(self): + self._check_freq('A', 1970) + + def test_monthly(self): + self._check_freq('M', '1970-01') + + def test_weekly(self): + self._check_freq('W-THU', '1970-01-01') + + def test_daily(self): + self._check_freq('D', '1970-01-01') + + def test_business_daily(self): + self._check_freq('B', '1970-01-01') + + def test_hourly(self): + self._check_freq('H', '1970-01-01') + + def test_minutely(self): + self._check_freq('T', '1970-01-01') + + def test_secondly(self): + self._check_freq('S', '1970-01-01') + + def _check_freq(self, freq, base_date): + rng = PeriodIndex(start=base_date, periods=10, freq=freq) + exp = np.arange(10, dtype=np.int64) + self.assert_(np.array_equal(rng.values, exp)) + + def test_negone_ordinals(self): + freqs = ['A', 'M', 'Q', 'D','H', 'T', 'S'] + + period = Period(ordinal=-1, freq='D') + for freq in freqs: + repr(period.asfreq(freq)) + + for freq in freqs: + period = Period(ordinal=-1, freq=freq) + repr(period) + self.assertEquals(period.year, 1969) + + period = Period(ordinal=-1, freq='B') + repr(period) + period = Period(ordinal=-1, freq='W') + repr(period) + if __name__ == '__main__': import nose diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 3b35921ede5ba..ce568f5a98162 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -54,7 +54,7 @@ def test_custom_grouper(self): # construct expected val arr = [1] + [5] * 2592 idx = dti[0:-1:5] - idx = 
idx.append(DatetimeIndex([np.datetime64(dti[-1])])) + idx = idx.append(dti[-1:]) expect = Series(arr, index=idx) # cython returns float for now @@ -347,6 +347,40 @@ def test_resample_panel_numpy(self): expected = panel.resample('M', how='mean', axis=1) tm.assert_panel_equal(result, expected) + def test_resample_anchored_ticks(self): + # If a fixed delta (5 minute, 4 hour) evenly divides a day, we should + # "anchor" the origin at midnight so we get regular intervals rather + # than starting from the first timestamp which might start in the middle + # of a desired interval + + rng = date_range('1/1/2000 04:00:00', periods=86400, freq='s') + ts = Series(np.random.randn(len(rng)), index=rng) + ts[:2] = np.nan # so results are the same + + freqs = ['t', '5t', '15t', '30t', '4h', '12h'] + for freq in freqs: + result = ts[2:].resample(freq, closed='left', label='left') + expected = ts.resample(freq, closed='left', label='left') + assert_series_equal(result, expected) + + def test_resample_base(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s') + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample('5min', base=2) + exp_rng = date_range('1/1/2000 00:02:00', '1/1/2000 02:02', + freq='5min') + self.assert_(resampled.index.equals(exp_rng)) + + def test_resample_daily_anchored(self): + rng = date_range('1/1/2000 0:00:00', periods=10000, freq='T') + ts = Series(np.random.randn(len(rng)), index=rng) + ts[:2] = np.nan # so results are the same + + result = ts[2:].resample('D', closed='left', label='left') + expected = ts.resample('D', closed='left', label='left') + assert_series_equal(result, expected) + def _simple_ts(start, end, freq='D'): rng = date_range(start, end, freq=freq) @@ -502,6 +536,15 @@ def test_cant_fill_missing_dups(self): s = TimeSeries(np.random.randn(5), index=rng) self.assertRaises(Exception, s.resample, 'A') + def test_resample_5minute(self): + rng = period_range('1/1/2000', '1/5/2000', freq='T') + ts = TimeSeries(np.random.randn(len(rng)), index=rng) + + result = ts.resample('5min') + expected = ts.to_timestamp().resample('5min') + assert_series_equal(result, expected) + + class TestTimeGrouper(unittest.TestCase): def setUp(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 4eff342e58c6e..1868b56176af5 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -54,7 +54,7 @@ def test_is_unique_monotonic(self): def test_index_unique(self): uniques = self.dups.index.unique() - self.assert_(uniques.dtype == 'M8') # sanity + self.assert_(uniques.dtype == 'M8[ns]') # sanity def test_duplicate_dates_indexing(self): ts = self.dups @@ -132,6 +132,12 @@ def test_getitem_median_slice_bug(self): expected = s[indexer[0]] assert_series_equal(result, expected) + def test_series_box_timestamp(self): + rng = date_range('20090415', '20090519', freq='B') + s = Series(rng) + + self.assert_(isinstance(s[5], Timestamp)) + def test_series_ctor_plus_datetimeindex(self): rng = date_range('20090415', '20090519', freq='B') data = dict((k, 1) for k in rng) @@ -304,7 +310,7 @@ def test_frame_ctor_datetime64_column(self): dates = np.asarray(rng) df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates}) - self.assert_(np.issubdtype(df['B'].dtype, np.datetime64)) + self.assert_(np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))) def test_frame_add_datetime64_column(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', @@ -312,7 +318,7 @@ def 
test_frame_add_datetime64_column(self): df = DataFrame(index=np.arange(len(rng))) df['A'] = rng - self.assert_(np.issubdtype(df['A'].dtype, np.datetime64)) + self.assert_(np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))) def test_series_ctor_datetime64(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', @@ -320,14 +326,14 @@ def test_series_ctor_datetime64(self): dates = np.asarray(rng) series = Series(dates) - self.assert_(np.issubdtype(series.dtype, np.datetime64)) + self.assert_(np.issubdtype(series.dtype, np.dtype('M8[ns]'))) def test_reindex_series_add_nat(self): rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') series = Series(rng) result = series.reindex(range(15)) - self.assert_(np.issubdtype(result.dtype, np.datetime64)) + self.assert_(np.issubdtype(result.dtype, np.dtype('M8[ns]'))) mask = result.isnull() self.assert_(mask[-5:].all()) @@ -338,14 +344,14 @@ def test_reindex_frame_add_nat(self): df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng}) result = df.reindex(range(15)) - self.assert_(np.issubdtype(result['B'].dtype, np.datetime64)) + self.assert_(np.issubdtype(result['B'].dtype, np.dtype('M8[ns]'))) mask = com.isnull(result)['B'] self.assert_(mask[-5:].all()) self.assert_(not mask[:-5].any()) def test_series_repr_nat(self): - series = Series([0, 1, 2, NaT], dtype='M8[us]') + series = Series([0, 1000, 2000, NaT], dtype='M8[ns]') result = repr(series) expected = ('0 1970-01-01 00:00:00\n' @@ -355,20 +361,20 @@ def test_series_repr_nat(self): self.assertEquals(result, expected) def test_fillna_nat(self): - series = Series([0, 1, 2, NaT], dtype='M8[us]') + series = Series([0, 1, 2, NaT], dtype='M8[ns]') filled = series.fillna(method='pad') - filled2 = series.fillna(value=series[2]) + filled2 = series.fillna(value=series.values[2]) expected = series.copy() - expected[3] = expected[2] + expected.values[3] = expected.values[2] assert_series_equal(filled, expected) assert_series_equal(filled2, expected) df = DataFrame({'A': series}) filled = df.fillna(method='pad') - filled2 = df.fillna(value=series[2]) + filled2 = df.fillna(value=series.values[2]) expected = DataFrame({'A': expected}) assert_frame_equal(filled, expected) assert_frame_equal(filled2, expected) @@ -381,7 +387,7 @@ def test_string_na_nat_conversion(self): strings = np.array(['1/1/2000', '1/2/2000', np.nan, '1/4/2000, 12:34:56'], dtype=object) - expected = np.empty(4, dtype='M8') + expected = np.empty(4, dtype='M8[ns]') for i, val in enumerate(strings): if com.isnull(val): expected[i] = NaT @@ -411,7 +417,7 @@ def test_string_na_nat_conversion(self): result = to_datetime(series) dresult = to_datetime(dseries) - expected = Series(np.empty(5, dtype='M8[us]'), index=idx) + expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) for i in range(5): x = series[i] if isnull(x): @@ -653,6 +659,22 @@ def test_datetimeindex_integers_shift(self): expected = rng.shift(-5) self.assert_(result.equals(expected)) + def test_astype_object(self): + # NumPy 1.6.1 weak ns support + rng = date_range('1/1/2000', periods=20) + + casted = rng.astype('O') + exp_values = list(rng) + + self.assert_(np.array_equal(casted, exp_values)) + + + def test_catch_infinite_loop(self): + offset = datetools.DateOffset(minute=5) + # blow up, don't loop forever + self.assertRaises(Exception, date_range, datetime(2011,11,11), + datetime(2011,11,12), freq=offset) + def _simple_ts(start, end, freq='D'): rng = date_range(start, end, freq=freq) @@ -875,7 +897,7 @@ def test_date_range_normalize(self): offset = timedelta(2) values = 
np.array([snap + i * offset for i in range(n)], - dtype='M8[us]') + dtype='M8[ns]') self.assert_(np.array_equal(rng, values)) @@ -976,8 +998,7 @@ def setUp(self): self.series = Series(rand(len(dti)), dti) def test_datetimeindex_accessors(self): - dti = DatetimeIndex(freq='Q-JAN', start=datetime(1997,12,31), - periods=100) + dti = DatetimeIndex(freq='Q-JAN', start=datetime(1997,12,31), periods=100) self.assertEquals(dti.year[0], 1998) self.assertEquals(dti.month[0], 1) @@ -1063,11 +1084,11 @@ def test_datetimeindex_constructor(self): idx4 = DatetimeIndex(arr) arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', - '2005-01-04'], dtype='M8[us]') + '2005-01-04'], dtype='M8[ns]') idx5 = DatetimeIndex(arr) arr = np.array(['1/1/2005', '1/2/2005', 'Jan 3, 2005', - '2005-01-04'], dtype='M8[us]') + '2005-01-04'], dtype='M8[ns]') idx6 = DatetimeIndex(arr) for other in [idx2, idx3, idx4, idx5, idx6]: @@ -1110,7 +1131,7 @@ def test_dti_reset_index_round_trip(self): dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D') d1 = DataFrame({'v' : np.random.rand(len(dti))}, index=dti) d2 = d1.reset_index() - self.assert_(d2.dtypes[0] == np.datetime64) + self.assert_(d2.dtypes[0] == np.dtype('M8[ns]')) d3 = d2.set_index('index') assert_frame_equal(d1, d3) @@ -1128,6 +1149,53 @@ def test_datetimeindex_union_join_empty(self): # TODO: test merge & concat with datetime64 block +class TestTimestamp(unittest.TestCase): + + def test_basics_nanos(self): + arr = np.array(['1/1/2000'], dtype='M8[ns]') + stamp = Timestamp(arr[0].view('i8') + 500) + self.assert_(stamp.year == 2000) + self.assert_(stamp.month == 1) + self.assert_(stamp.microsecond == 0) + self.assert_(stamp.nanosecond == 500) + + def test_comparison(self): + # 5-18-2012 00:00:00.000 + stamp = 1337299200000000000L + + val = Timestamp(stamp) + + self.assert_(val == val) + self.assert_(not val != val) + self.assert_(not val < val) + self.assert_(val <= val) + self.assert_(not val > val) + self.assert_(val >= val) + + other = datetime(2012, 5, 18) + self.assert_(val == other) + self.assert_(not val != other) + self.assert_(not val < other) + self.assert_(val <= other) + self.assert_(not val > other) + self.assert_(val >= other) + + other = Timestamp(stamp + 100) + + self.assert_(not val == other) + self.assert_(val != other) + self.assert_(val < other) + self.assert_(val <= other) + self.assert_(other > val) + self.assert_(other >= val) + + def test_delta_preserve_nanos(self): + val = Timestamp(1337299200000000123L) + result = val + timedelta(1) + self.assert_(result.nanosecond == val.nanosecond) + +""" + class TestNewOffsets(unittest.TestCase): def test_yearoffset(self): @@ -1320,12 +1388,7 @@ def test_dayofmonthoffset(self): self.assert_(t.weekday() == day) - def test_catch_infinite_loop(self): - offset = datetools.DateOffset(minute=5) - # blow up, don't loop forever - self.assertRaises(Exception, date_range, datetime(2011,11,11), - datetime(2011,11,12), freq=offset) - +""" if __name__ == '__main__': nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py index 38d812915d0f7..02a98858ed808 100644 --- a/pandas/tseries/tests/test_util.py +++ b/pandas/tseries/tests/test_util.py @@ -6,17 +6,17 @@ from pandas import Series, date_range import pandas.util.testing as tm -from pandas.tseries.util import convert_to_annual, isleapyear +from pandas.tseries.util import pivot_annual, isleapyear -class TestConvertAnnual(unittest.TestCase): +class 
TestPivotAnnual(unittest.TestCase): """ - New pandas of scikits.timeseries convert_to_annual + New pandas of scikits.timeseries pivot_annual """ def test_daily(self): rng = date_range('1/1/2000', '12/31/2004', freq='D') ts = Series(np.random.randn(len(rng)), index=rng) - annual = convert_to_annual(ts, 'D') + annual = pivot_annual(ts, 'D') doy = ts.index.dayofyear doy[(-isleapyear(ts.index.year)) & (doy >= 60)] += 1 @@ -40,7 +40,7 @@ def test_monthly(self): rng = date_range('1/1/2000', '12/31/2004', freq='M') ts = Series(np.random.randn(len(rng)), index=rng) - annual = convert_to_annual(ts, 'M') + annual = pivot_annual(ts, 'M') month = ts.index.month @@ -49,13 +49,13 @@ def test_monthly(self): subset.index = [x.year for x in subset.index] tm.assert_series_equal(annual[i].dropna(), subset) - def test_interval_monthly(self): + def test_period_monthly(self): pass - def test_interval_daily(self): + def test_period_daily(self): pass - def test_interval_weekly(self): + def test_period_weekly(self): pass if __name__ == '__main__': diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index bfeec9e8081da..d7a296df8655b 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -135,6 +135,8 @@ def parse_time_string(arg, freq=None): """ from pandas.core.format import print_config from pandas.tseries.offsets import DateOffset + from pandas.tseries.frequencies import (_get_rule_month, _month_numbers, + _get_freq_str) if not isinstance(arg, basestring): return arg @@ -165,7 +167,17 @@ def parse_time_string(arg, freq=None): y = int(y_str) if add_century: y += 2000 - ret = default.replace(year=y, month=(q-1)*3+1) + + if freq is not None: + # hack attack, #1228 + mnum = _month_numbers[_get_rule_month(freq)] + 1 + month = (mnum + (q - 1) * 3) % 12 + 1 + if month > mnum: + y -= 1 + else: + month = (q - 1) * 3 + 1 + + ret = default.replace(year=y, month=month) return ret, ret, 'quarter' is_mo_str = freq is not None and freq == 'M' diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py index c3b4b8272d5b9..2163deaf3c102 100644 --- a/pandas/tseries/util.py +++ b/pandas/tseries/util.py @@ -3,7 +3,7 @@ from pandas.core.frame import DataFrame import pandas.core.nanops as nanops -def convert_to_annual(series, freq=None): +def pivot_annual(series, freq=None): """ Group a series by years, taking leap years into account. 
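The renamed helper keeps the old signature (``pivot_annual(series, freq=None)``). A minimal usage sketch; the orientation of the result (years as rows, month or day-of-year as columns) is inferred from the tests in test_util.py above rather than stated by the diff itself::

    import numpy as np
    from pandas import Series, date_range
    from pandas.tseries.util import pivot_annual

    rng = date_range('1/1/2000', '12/31/2004', freq='M')
    ts = Series(np.random.randn(len(rng)), index=rng)

    # pivot the monthly series into a year-by-month table; a year with no
    # observation for a given month gets NaN in that column
    annual = pivot_annual(ts, 'M')
    jan = annual[1].dropna()   # January values, indexed by year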
diff --git a/setup.py b/setup.py index 75747e1d9fb2f..761f86135f22c 100755 --- a/setup.py +++ b/setup.py @@ -74,6 +74,8 @@ msg = "pandas requires NumPy >= 1.6 due to datetime64 dependency" sys.exit(msg) +from numpy.distutils.misc_util import get_pkg_info + from distutils.extension import Extension from distutils.command.build import build from distutils.command.build_ext import build_ext @@ -335,7 +337,7 @@ def run(self): cmdclass['sdist'] = CheckSDist tseries_depends = ['reindex', 'groupby', 'skiplist', 'moments', - 'generated', 'reduce', 'stats', 'datetime', + 'reduce', 'stats', 'datetime', 'hashtable', 'inference', 'properties', 'join', 'engines'] def srcpath(name=None, suffix='.pyx', subdir='src'): @@ -348,21 +350,46 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): else: tseries_depends = [] +algos_ext = Extension('pandas._algos', + sources=[srcpath('generated', suffix=suffix)], + include_dirs=[np.get_include()], + ) + tseries_ext = Extension('pandas._tseries', - depends=tseries_depends + ['pandas/src/numpy_helper.h'], - sources=[srcpath('tseries', suffix=suffix), - 'pandas/src/period.c', - 'pandas/src/np_datetime.c', - 'pandas/src/np_datetime_strings.c'], - include_dirs=[np.get_include()], - # pyrex_gdb=True, - # extra_compile_args=['-Wconversion'] - ) + depends=tseries_depends + ['pandas/src/numpy_helper.h'], + sources=[srcpath('tseries', suffix=suffix), + 'pandas/src/period.c', + 'pandas/src/np_datetime.c', + 'pandas/src/np_datetime_strings.c'], + include_dirs=[np.get_include()], + # pyrex_gdb=True, + # extra_compile_args=['-Wconversion'] + ) + +# tseries_ext = Extension('pandas._tseries', +# depends=tseries_depends + ['pandas/src/numpy_helper.h'], +# sources=[srcpath('datetime', suffix=suffix)], +# include_dirs=[np.get_include()], +# # pyrex_gdb=True, +# # extra_compile_args=['-Wconversion'] +# ) + sparse_ext = Extension('pandas._sparse', sources=[srcpath('sparse', suffix=suffix)], include_dirs=[np.get_include()]) +ujson_ext = Extension('pandas._ujson', + sources=['pandas/src/ujson/python/ujson.c', + 'pandas/src/ujson/python/objToJSON.c', + 'pandas/src/ujson/python/JSONtoObj.c', + 'pandas/src/ujson/lib/ultrajsonenc.c', + 'pandas/src/ujson/lib/ultrajsondec.c'], + include_dirs=['pandas/src/ujson/python', + 'pandas/src/ujson/lib', + np.get_include()], + extra_link_args=[get_pkg_info('npymath').libs()]) + sandbox_ext = Extension('pandas._sandbox', sources=[srcpath('sandbox', suffix=suffix), 'pandas/src/period.c', @@ -374,7 +401,7 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): sources=[srcpath('cppsandbox', suffix=suffix)], include_dirs=[np.get_include()]) -extensions = [tseries_ext, sparse_ext] +extensions = [algos_ext, tseries_ext, sparse_ext, ujson_ext] if not ISRELEASED: extensions.extend([sandbox_ext]) diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index f8e5790796bbb..f690135e02e62 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -144,3 +144,51 @@ def f(): groupby_pivot_table = Benchmark(stmt, setup, start_date=datetime(2011, 12, 15)) +#---------------------------------------------------------------------- +# dict return values + +setup = common_setup + """ +labels = np.arange(1000).repeat(10) +data = Series(randn(len(labels))) +f = lambda x: {'first': x.values[0], 'last': x.values[-1]} +""" + +groupby_apply_dict_return = Benchmark('data.groupby(labels).apply(f)', + setup, start_date=datetime(2011, 12, 15)) + +#---------------------------------------------------------------------- +# First / last functions + +setup = common_setup + """ +labels 
= np.arange(10000).repeat(10) +data = Series(randn(len(labels))) +data[::3] = np.nan +data[1::3] = np.nan +labels = labels.take(np.random.permutation(len(labels))) +""" + +groupby_first = Benchmark('data.groupby(labels).first()', setup, + start_date=datetime(2012, 5, 1)) + +groupby_last = Benchmark('data.groupby(labels).last()', setup, + start_date=datetime(2012, 5, 1)) + + +#---------------------------------------------------------------------- +# groupby_indices replacement, chop up Series + +setup = common_setup + """ +try: + rng = date_range('1/1/2000', '12/31/2005', freq='H') + year, month, day = rng.year, rng.month, rng.day +except: + rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour()) + year = rng.map(lambda x: x.year) + month = rng.map(lambda x: x.month) + day = rng.map(lambda x: x.day) + +ts = Series(np.random.randn(len(rng)), index=rng) +""" + +groupby_indices = Benchmark('len(ts.groupby([year, month, day]))', + setup, start_date=datetime(2012, 1, 1)) diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py index 3df763133da87..819a81a53db52 100644 --- a/vb_suite/index_object.py +++ b/vb_suite/index_object.py @@ -16,7 +16,20 @@ rng2 = rng[:-1] """ -index_datetime_intersection = Benchmark("rng.intersection(rng2)", setup, - name='index_datetime_intersection') -index_datetime_union = Benchmark("rng.union(rng2)", setup, - name='index_datetime_union') +index_datetime_intersection = Benchmark("rng.intersection(rng2)", setup) +index_datetime_union = Benchmark("rng.union(rng2)", setup) + +# integers +setup = common_setup + """ +N = 1000000 +options = np.arange(N) + +left = Index(options.take(np.random.permutation(N)[:N // 2])) +right = Index(options.take(np.random.permutation(N)[:N // 2])) +""" + +index_int64_union = Benchmark('left.union(right)', setup, + start_date=datetime(2011, 1, 1)) + +index_int64_intersection = Benchmark('left.intersection(right)', setup, + start_date=datetime(2011, 1, 1)) diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py index 002761a00adf1..07fcfcb5ddc14 100644 --- a/vb_suite/join_merge.py +++ b/vb_suite/join_merge.py @@ -66,6 +66,12 @@ name='join_dataframe_index_multi', start_date=datetime(2011, 10, 20)) +#---------------------------------------------------------------------- +# Joins on integer keys + +join_dataframe_integer_key = Benchmark("merge(df, df2, on='key')", setup, + start_date=datetime(2011, 10, 20)) + #---------------------------------------------------------------------- # DataFrame joins on index @@ -144,3 +150,20 @@ def sample(values, k): concat_series_axis1 = Benchmark('concat(pieces, axis=1)', setup, start_date=datetime(2012, 2, 27)) + +#---------------------------------------------------------------------- +# Ordered merge + +setup = common_setup + """ +groups = np.array([rands(10) for _ in xrange(10)], dtype='O') + +left = DataFrame({'group': groups.repeat(5000), + 'key' : np.tile(np.arange(0, 10000, 2), 10), + 'lvalue': np.random.randn(50000)}) + +right = DataFrame({'key' : np.arange(10000), + 'rvalue' : np.random.randn(10000)}) + +""" + +stmt = "ordered_merge(left, right, on='key', left_by='group')" diff --git a/vb_suite/make.py b/vb_suite/make.py index 0b9dd64690e40..c97b9c924150c 100755 --- a/vb_suite/make.py +++ b/vb_suite/make.py @@ -30,11 +30,6 @@ def upload(): os.system('cd build/html; rsync -avz . 
pandas@pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/vbench/ -essh') -def uploadpdf(): - 'push a copy to the sf site' - os.system('cd build/latex; scp pandas.pdf wesmckinn,pandas@web.sf.net' - ':/home/groups/p/pa/pandas/htdocs/') - def clean(): if os.path.exists('build'): shutil.rmtree('build') @@ -48,29 +43,10 @@ def html(): 'source build/html'): raise SystemExit("Building HTML failed.") -def latex(): - check_build() - if sys.platform != 'win32': - # LaTeX format. - if os.system('sphinx-build -b latex -d build/doctrees ' - 'source build/latex'): - raise SystemExit("Building LaTeX failed.") - # Produce pdf. - - os.chdir('build/latex') - - # Call the makefile produced by sphinx... - if os.system('make'): - raise SystemExit("Rendering LaTeX failed.") - - os.chdir('../..') - else: - print 'latex build has not been tested on windows' - def check_build(): build_dirs = [ 'build', 'build/doctrees', 'build/html', - 'build/latex', 'build/plots', 'build/_static', + 'build/plots', 'build/_static', 'build/_templates'] for d in build_dirs: try: @@ -79,15 +55,85 @@ def check_build(): pass def all(): - # clean() + clean() html() +def auto_update(): + msg = '' + try: + clean() + html() + upload() + sendmail() + except (Exception, SystemExit), inst: + msg += str(inst) + '\n' + sendmail(msg) + +def sendmail(err_msg=None): + from_name, to_name = _get_config() + + if err_msg is None: + msgstr = 'Daily vbench uploaded successfully' + subject = "VB: daily update successful" + else: + msgstr = err_msg + subject = "VB: daily update failed" + + import smtplib + from email.MIMEText import MIMEText + msg = MIMEText(msgstr) + msg['Subject'] = subject + msg['From'] = from_name + msg['To'] = to_name + + server_str, port, login, pwd = _get_credentials() + server = smtplib.SMTP(server_str, port) + server.ehlo() + server.starttls() + server.ehlo() + + server.login(login, pwd) + try: + server.sendmail(from_name, to_name, msg.as_string()) + finally: + server.close() + +def _get_dir(): + import getpass + USERNAME = getpass.getuser() + if sys.platform == 'darwin': + HOME = '/Users/%s' % USERNAME + else: + HOME = '/home/%s' % USERNAME + + tmp_dir = '%s/tmp' % HOME + return tmp_dir + +def _get_credentials(): + tmp_dir = _get_dir() + cred = '%s/credentials' % tmp_dir + with open(cred, 'r') as fh: + server, port, un, domain = fh.read().split(',') + port = int(port) + login = un + '@' + domain + '.com' + + import base64 + with open('%s/cron_email_pwd' % tmp_dir, 'r') as fh: + pwd = base64.b64decode(fh.read()) + + return server, port, login, pwd + +def _get_config(): + tmp_dir = _get_dir() + with open('%s/config' % tmp_dir, 'r') as fh: + from_name, to_name = fh.read().split(',') + return from_name, to_name + funcd = { 'html' : html, - 'latex' : latex, 'clean' : clean, 'upload' : upload, - 'uploadpdf' : uploadpdf, + 'auto_update' : auto_update, 'all' : all, } diff --git a/vb_suite/miscellaneous.py b/vb_suite/miscellaneous.py index 8295d275f2dd6..eeeaf01a8b4af 100644 --- a/vb_suite/miscellaneous.py +++ b/vb_suite/miscellaneous.py @@ -20,3 +20,15 @@ def prop(self): misc_cache_readonly = Benchmark("obj.prop", setup, name="misc_cache_readonly", ncalls=2000000) +#---------------------------------------------------------------------- +# match + +setup = common_setup + """ +from pandas.util.testing import rands + +uniques = np.array([rands(10) for _ in xrange(1000)], dtype='O') +all = uniques.repeat(10) +""" + +match_strings = Benchmark("match(all, uniques)", setup, + start_date=datetime(2012, 5, 12)) diff --git 
a/vb_suite/parser.py b/vb_suite/parser.py index 7c2754ca7da07..946e1327578c0 100644 --- a/vb_suite/parser.py +++ b/vb_suite/parser.py @@ -50,3 +50,42 @@ setup, cleanup="os.remove('test.csv')", start_date=datetime(2012, 5, 7)) + +setup = common_setup + """ +from pandas import read_table +from cStringIO import StringIO +import os +N = 10000 +K = 8 +data = '''\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +''' +data = data * 2000 +""" +cmd = ("read_table(StringIO(data), sep=',', header=None, " + "parse_dates=[[1,2], [1,3]])") +sdate = datetime(2012, 5, 7) +read_table_multiple_date = Benchmark(cmd, setup, start_date=sdate) + +setup = common_setup + """ +from pandas import read_table +from cStringIO import StringIO +import os +N = 10000 +K = 8 +data = '''\ +KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +''' +data = data * 2000 +""" +cmd = "read_table(StringIO(data), sep=',', header=None, parse_dates=[1])" +sdate = datetime(2012, 5, 7) +read_table_multiple_date_baseline = Benchmark(cmd, setup, start_date=sdate) diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py index 9c307131ae5ac..62b26724eff46 100644 --- a/vb_suite/reindex.py +++ b/vb_suite/reindex.py @@ -114,6 +114,7 @@ def backfill(): # pathological, but realistic setup = common_setup + """ +import pandas._tseries as lib N = 10000 K = 10 @@ -135,6 +136,32 @@ def backfill(): name='frame_drop_duplicates', start_date=datetime(2011, 11, 15)) +statement = "df.drop_duplicates(['key1', 'key2'], inplace=True)" +frame_drop_dup_inplace = Benchmark(statement, setup, + name='frame_drop_dup_inplace', + start_date=datetime(2012, 5, 16)) + +lib_fast_zip = Benchmark('lib.fast_zip(df.values.T)', setup, + name='lib_fast_zip', + start_date=datetime(2012, 1, 1)) + +setup = setup + """ +df.ix[:10000, :] = np.nan +""" +statement2 = "df.drop_duplicates(['key1', 'key2'])" +frame_drop_duplicates_na = Benchmark(statement2, setup, + name='frame_drop_duplicates_na', + start_date=datetime(2012, 5, 15)) + +lib_fast_zip_fillna = Benchmark('lib.fast_zip_fillna(df.values.T)', setup, + name='lib_fast_zip_fillna', + start_date=datetime(2012, 5, 15)) + +statement2 = "df.drop_duplicates(['key1', 'key2'], inplace=True)" +frame_drop_dup_na_inplace = Benchmark(statement2, setup, + name='frame_drop_dup_na_inplace', + start_date=datetime(2012, 5, 16)) + #---------------------------------------------------------------------- # fillna, many columns diff --git a/vb_suite/replace.py b/vb_suite/replace.py new file mode 100644 index 0000000000000..bc5397df2c66d --- /dev/null +++ b/vb_suite/replace.py @@ -0,0 +1,24 @@ +from vbench.api import Benchmark + +common_setup = """from pandas_vb_common import * +from datetime import timedelta +import pandas._tseries as lib +N = 1000000 + +try: + rng = date_range('1/1/2000', periods=N, freq='min') +except NameError: + rng = DateRange('1/1/2000', periods=N, 
offset=datetools.Minute()) + date_range = DateRange + +ts = Series(np.random.randn(N), index=rng) + +def replace_slow(ser, old, new): + lib.slow_replace(ser.values, old, new) + return ser +""" + +replace_fillna = Benchmark('ts.fillna(0., inplace=True)', common_setup) +replace_replacena = Benchmark('ts.replace(np.nan, 0., inplace=True)', + common_setup) +replace_putmask = Benchmark('replace_slow(ts, np.nan, 0.)', common_setup) diff --git a/vb_suite/sparse.py b/vb_suite/sparse.py index 3c068e743697c..18cd71fb45ff8 100644 --- a/vb_suite/sparse.py +++ b/vb_suite/sparse.py @@ -14,7 +14,7 @@ rng = np.asarray(DateRange('1/1/2000', periods=N, offset=datetools.Minute())) -# rng2 = np.asarray(rng).astype('M8[us]').astype('i8') +# rng2 = np.asarray(rng).astype('M8[ns]').astype('i8') series = {} for i in range(1, K + 1): diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index 1fccea71f4ba9..98efe7917d977 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -9,11 +9,24 @@ rng = date_range('1/1/2000', periods=N, freq='min') except NameError: rng = DateRange('1/1/2000', periods=N, offset=datetools.Minute()) - date_range = DateRange + def date_range(start=None, end=None, periods=None, freq=None): + return DateRange(start, end, periods=periods, offset=freq) ts = Series(np.random.randn(N), index=rng) """ +#---------------------------------------------------------------------- +# Lookup value in large time series, hash map population + +setup = common_setup + """ +rng = date_range('1/1/2000', periods=1500000, freq='s') +ts = Series(1, index=rng) +""" + +stmt = "ts[ts.index[len(ts) // 2]]; ts.index._cleanup()" +timeseries_large_lookup_value = Benchmark(stmt, setup, + start_date=datetime(2012, 1, 1)) + #---------------------------------------------------------------------- # Test slice minutely series
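The replace and timeseries benchmark setups above share one compatibility pattern: prefer the new ``date_range``/``freq`` API and fall back to the legacy ``DateRange``/``offset`` API (the timeseries setup additionally wraps ``DateRange`` in a ``date_range``-compatible function so later ``freq=`` calls in the same setup still route through ``offset=``). A standalone sketch of that pattern, assuming ``pandas.core.datetools`` for the offset classes and ImportError-based detection instead of the NameError check used in the vbench files::

    import numpy as np
    from pandas import Series

    N = 100000
    try:
        # pandas >= 0.8: date_range understands string frequency aliases
        from pandas import date_range
        rng = date_range('1/1/2000', periods=N, freq='min')
    except ImportError:
        # older pandas: DateRange takes a DateOffset via the 'offset' keyword
        from pandas import DateRange
        import pandas.core.datetools as datetools
        rng = DateRange('1/1/2000', periods=N, offset=datetools.Minute())

    ts = Series(np.random.randn(N), index=rng)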