@@ -688,6 +688,55 @@ def _simple_new(
688
688
new ._dtype = dtype
689
689
return new
690
690
691
@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If `data` does not have exactly one column.

    Examples
    --------
    >>> import scipy.sparse
    >>> mat = scipy.sparse.coo_matrix((4, 1))
    >>> pd.SparseArray.from_spmatrix(mat)
    [0.0, 0.0, 0.0, 0.0]
    Fill: 0.0
    IntIndex
    Indices: array([], dtype=int32)
    """
    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{}'".format(ncol)
        )

    # Our sparse index classes require that the positions be strictly
    # increasing. Take both the values and their row positions from a
    # CSC copy with sorted indices: ``data.data`` and ``data.indices``
    # then describe exactly the same stored entries, in the same order.
    # (``data.nonzero()`` skips explicitly stored zeros, so pairing it
    # with ``data.data`` can misalign values and positions.)
    # ``sorted_indices`` returns a copy, so the caller's matrix is neither
    # mutated nor aliased by the resulting array.
    data = data.tocsc().sorted_indices()
    arr = data.data
    idx = data.indices

    # The fill value is the zero of the matrix' dtype, as a Python scalar.
    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)
739
+
691
740
def __array__ (self , dtype = None , copy = True ):
692
741
fill_value = self .fill_value
693
742
@@ -1899,27 +1948,32 @@ def _make_index(length, indices, kind):
1899
1948
# ----------------------------------------------------------------------------
1900
1949
# Accessor
1901
1950
1951
+
1952
class BaseAccessor(object):
    """
    Shared base for the ``.sparse`` accessors.

    Stores the accessed object on ``_parent`` and runs ``_validate`` on
    it at construction time. Subclasses must override ``_validate``.
    """

    # Message subclasses raise when the accessed object is not sparse.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # The parent is stored before validation so that ``_validate``
        # overrides may already read ``self._parent``.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Check that `data` may be used with this accessor."""
        raise NotImplementedError
1961
+
1962
+
1902
1963
@delegate_names (SparseArray , ['npoints' , 'density' , 'fill_value' ,
1903
1964
'sp_values' ],
1904
1965
typ = 'property' )
1905
- class SparseAccessor (PandasDelegate ):
1966
+ class SparseAccessor (BaseAccessor , PandasDelegate ):
1906
1967
"""
1907
1968
Accessor for SparseSparse from other sparse matrix data types.
1908
1969
"""
1909
1970
1910
- def __init__ (self , data = None ):
1911
- self ._validate (data )
1912
- # Store the Series since we need that for to_coo
1913
- self ._parent = data
1914
-
1915
- @staticmethod
1916
- def _validate (data ):
1971
+ def _validate (self , data ):
1917
1972
if not isinstance (data .dtype , SparseDtype ):
1918
- msg = "Can only use the '.sparse' accessor with Sparse data."
1919
- raise AttributeError (msg )
1973
+ raise AttributeError (self ._validation_msg )
1920
1974
1921
1975
def _delegate_property_get (self , name , * args , ** kwargs ):
1922
- return getattr (self ._parent .values , name )
1976
+ return getattr (self ._parent .array , name )
1923
1977
1924
1978
def _delegate_method (self , name , * args , ** kwargs ):
1925
1979
if name == 'from_coo' :
@@ -2033,3 +2087,188 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
2033
2087
column_levels ,
2034
2088
sort_labels = sort_labels )
2035
2089
return A , rows , columns
2090
+
2091
def to_dense(self):
    """
    Convert a Series from sparse values to dense.

    .. versionadded:: 0.25.0

    Returns
    -------
    Series
        A Series with the same values, index and name as the parent,
        with the values stored as a dense array.

    Examples
    --------
    >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
    >>> series
    0    0
    1    1
    2    0
    dtype: Sparse[int64, 0]

    >>> series.sparse.to_dense()
    0    0
    1    1
    2    0
    dtype: int64
    """
    # Imported locally rather than at module import time.
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)
2121
+
2122
+
2123
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        # Every column must be sparse for the accessor to be usable.
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of `index` or `columns` does not match the
            shape of `data`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Key the arrays by position and attach the labels afterwards:
        # keying a dict by label would silently drop columns whenever
        # `columns` contains duplicates.
        result = DataFrame(dict(enumerate(sparrays)), index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        parent = self._parent
        # Positional keys keep columns with duplicate labels apart; the
        # original labels are restored once the frame is built.
        data = {
            pos: column.array.to_dense()
            for pos, (_, column) in enumerate(parent.items())
        }
        result = DataFrame(data, index=parent.index)
        result.columns = parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ImportError
            If SciPy is not installed.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.

        Only the explicitly stored values of each column are written to
        the matrix; positions holding a column's fill value are left as
        implicit entries of the COO matrix.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        parent = self._parent
        # Hand over a plain list: ``dtypes`` is a Series indexed by column
        # label, so integer lookups into it are label-based, not positional.
        dtype = find_common_type(list(parent.dtypes))
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Iterate (label, column) pairs instead of looking columns up by
        # label, which would return a DataFrame for duplicated labels.
        for col, (_, column) in enumerate(parent.items()):
            sparse_values = column.array
            row = sparse_values.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(sparse_values.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=parent.shape)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        return np.mean([column.array.density
                        for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check them against `data`.

        Returns the ``(index, columns)`` pair to use. Raises ValueError
        if either has a length that differs from the matching dimension
        of ``data.shape``.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns
0 commit comments