@@ -689,6 +689,55 @@ def _simple_new(
689
689
new ._dtype = dtype
690
690
return new
691
691
692
@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If `data` does not have exactly one column.

    Examples
    --------
    >>> import scipy.sparse
    >>> mat = scipy.sparse.coo_matrix((4, 1))
    >>> pd.SparseArray.from_spmatrix(mat)
    [0.0, 0.0, 0.0, 0.0]
    Fill: 0.0
    IntIndex
    Indices: array([], dtype=int32)
    """
    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{}'".format(ncol)
        )

    # Take the stored values and their row positions from the same CSC
    # representation so the two arrays always have matching lengths.
    # Pairing ``data.data`` with ``data.nonzero()`` breaks as soon as the
    # matrix stores an explicit zero: ``nonzero()`` skips that entry while
    # ``data.data`` still contains it.
    data = data.tocsc()
    arr = data.data
    idx = data.indices

    # our sparse index classes require that the positions be strictly
    # increasing. So we need to sort idx, and arr accordingly. ``take``
    # returns new arrays, so the caller's matrix is left unmodified.
    loc = np.argsort(idx)
    arr = arr.take(loc)
    idx = idx.take(loc)

    # The fill value is the zero of the stored dtype, as a Python scalar.
    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)
740
+
692
741
def __array__ (self , dtype = None , copy = True ):
693
742
fill_value = self .fill_value
694
743
@@ -1900,27 +1949,32 @@ def _make_index(length, indices, kind):
1900
1949
# ----------------------------------------------------------------------------
1901
1950
# Accessor
1902
1951
1952
+
1953
class BaseAccessor(object):
    """
    Common base class for the ``.sparse`` accessors.

    The constructor stores the object the accessor is attached to and then
    runs ``_validate`` on it. Subclasses supply ``_validate``.
    """
    # Error text shared by subclasses when the data is not sparse.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # Keep a reference to the parent object first, then validate it.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """
        Check that `data` may be used with this accessor.

        The base implementation always raises; subclasses must override.
        """
        raise NotImplementedError()
1962
+
1963
+
1903
1964
@delegate_names (SparseArray , ['npoints' , 'density' , 'fill_value' ,
1904
1965
'sp_values' ],
1905
1966
typ = 'property' )
1906
- class SparseAccessor (PandasDelegate ):
1967
+ class SparseAccessor (BaseAccessor , PandasDelegate ):
1907
1968
"""
1908
1969
Accessor for SparseSparse from other sparse matrix data types.
1909
1970
"""
1910
1971
1911
- def __init__ (self , data = None ):
1912
- self ._validate (data )
1913
- # Store the Series since we need that for to_coo
1914
- self ._parent = data
1915
-
1916
- @staticmethod
1917
- def _validate (data ):
1972
def _validate(self, data):
    """
    Ensure `data` has a sparse dtype.

    Raises
    ------
    AttributeError
        If ``data.dtype`` is not a ``SparseDtype``.
    """
    if isinstance(data.dtype, SparseDtype):
        return
    raise AttributeError(self._validation_msg)
1921
1975
1922
1976
def _delegate_property_get(self, name, *args, **kwargs):
    """
    Return the attribute `name` of the array backing the parent object.

    Extra positional and keyword arguments are accepted but not used.
    """
    backing_array = self._parent.array
    return getattr(backing_array, name)
1924
1978
1925
1979
def _delegate_method (self , name , * args , ** kwargs ):
1926
1980
if name == 'from_coo' :
@@ -2034,3 +2088,188 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
2034
2088
column_levels ,
2035
2089
sort_labels = sort_labels )
2036
2090
return A , rows , columns
2091
+
2092
def to_dense(self):
    """
    Convert a Series from sparse values to dense.

    .. versionadded:: 0.25.0

    Returns
    -------
    Series
        A Series with the same values, index and name as the parent,
        stored as a dense array.

    Examples
    --------
    >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
    >>> series
    0    0
    1    1
    2    0
    dtype: Sparse[int64, 0]

    >>> series.sparse.to_dense()
    0    0
    1    1
    2    0
    dtype: int64
    """
    # Imported locally, as in the other accessor methods of this module.
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)
2122
+
2123
+
2124
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        """
        Ensure every column of `data` has a sparse dtype.

        Raises
        ------
        AttributeError
            If any entry of ``data.dtypes`` is not a ``SparseDtype``.
        """
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of `index` or `columns` does not match the
            shape of `data`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Key the arrays by position and attach the labels afterwards.
        # Building the dict directly from ``columns`` would silently drop
        # arrays whenever a label is repeated, and would tie the column
        # order to dict-key ordering.
        data = dict(enumerate(sparrays))
        result = DataFrame(data, index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense()
                for k, v in compat.iteritems(self._parent)}
        return DataFrame(data,
                         index=self._parent.index,
                         columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ImportError
            If SciPy is not installed.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        dtype = find_common_type(self._parent.dtypes)
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        # NOTE(review): only the stored values and their positions are
        # exported; each column's fill_value is not consulted here.
        cols, rows, datas = [], [], []
        # Walk the columns positionally. Looking a column up by label
        # returns a DataFrame rather than a Series when the label is
        # duplicated, which has no ``.array``.
        for col, (_, s) in enumerate(self._parent.iteritems()):
            row = s.array.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(s.array.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        return np.mean([column.array.density
                        for _, column in self._parent.iteritems()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check them against `data`.

        Missing labels default to ``default_index`` of the matching
        dimension of ``data.shape``.

        Raises
        ------
        ValueError
            If the length of `columns` or `index` does not match the
            corresponding dimension of `data`.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns
0 commit comments