@@ -678,6 +678,36 @@ def _simple_new(cls, sparse_array, sparse_index, dtype):
678
678
new ._dtype = dtype
679
679
return new
680
680
681
@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a 2-D SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If `data` is not 2-dimensional or does not have exactly
        one column.
    """
    # Raise explicitly instead of using ``assert``: assertions are
    # stripped when Python runs with ``-O``.
    if data.ndim != 2:
        raise ValueError("'data' must be 2-dimensional, not '{ndim}'"
                         .format(ndim=data.ndim))

    length, ncol = data.shape

    if ncol != 1:
        raise ValueError("'data' must have a single column, not '{ncol}'"
                         .format(ncol=ncol))

    # Normalise to CSC so that ``data.data`` and ``data.indices`` are
    # aligned 1-D arrays of the stored values and their row positions.
    # ``nonzero()`` is not used because it drops explicitly stored zeros,
    # which would leave the positions shorter than the values.
    data = data.tocsc()
    if not data.has_sorted_indices:
        # ``sorted_indices`` returns a copy, so the caller's matrix is
        # left untouched. Positions are handed to IntIndex in increasing
        # order (NOTE(review): presumably required by IntIndex - confirm).
        data = data.sorted_indices()

    arr = data.data
    idx = data.indices
    # The fill value is the zero of the stored dtype, as a Python scalar.
    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)
710
+
681
711
def __array__ (self , dtype = None , copy = True ):
682
712
fill_value = self .fill_value
683
713
@@ -1891,6 +1921,9 @@ def _make_index(length, indices, kind):
1891
1921
# ----------------------------------------------------------------------------
1892
1922
# Accessor
1893
1923
1924
# Error message used by the '.sparse' accessors' ``_validate`` methods when
# the data they are attached to is not sparse.
+ _validation_msg = "Can only use the '.sparse' accessor with Sparse data."
1925
+
1926
+
1894
1927
@delegate_names (SparseArray , ['npoints' , 'density' , 'fill_value' ,
1895
1928
'sp_values' ],
1896
1929
typ = 'property' )
@@ -1900,15 +1933,13 @@ class SparseAccessor(PandasDelegate):
1900
1933
"""
1901
1934
1902
1935
def __init__(self, data=None):
    """
    Bind the accessor to `data` and check that it holds sparse values.

    Parameters
    ----------
    data : Series, optional
        The object the accessor is attached to.
    """
    # to_coo needs the Series itself, so keep a reference to it.
    self._parent = data
    self._validate(data)
1906
1939
1907
def _validate(self, data):
    """
    Check that `data` has a sparse dtype.

    Raises
    ------
    AttributeError
        If ``data.dtype`` is not a SparseDtype.
    """
    if isinstance(data.dtype, SparseDtype):
        return
    raise AttributeError(_validation_msg)
1912
1943
1913
1944
def _delegate_property_get(self, name, *args, **kwargs):
    """
    Look up the delegated property `name` on the parent's values.

    Extra positional and keyword arguments are accepted but unused.
    """
    sparse_values = self._parent.values
    return getattr(sparse_values, name)
@@ -2025,3 +2056,126 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
2025
2056
column_levels ,
2026
2057
sort_labels = sort_labels )
2027
2058
return A , rows , columns
2059
+
2060
def to_dense(self):
    """
    Convert the sparse Series to a dense Series.

    Returns
    -------
    Series
        A Series built from the densified values, with the same index
        and name as the original.
    """
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)
2065
+
2066
+
2067
class SparseFrameAccessor(PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Provides conversions and properties for a DataFrame whose columns all
    have a SparseDtype.
    """

    def __init__(self, data=None):
        # Keep a reference to the DataFrame; the methods below read from it.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """
        Check that every column of `data` has a sparse dtype.

        Raises
        ------
        AttributeError
            If any entry of ``data.dtypes`` is not a SparseDtype.
        """
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(_validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame

        Raises
        ------
        ValueError
            If the length of `index` or `columns` does not match the
            shape of `data`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Key the arrays by position and attach the labels afterwards:
        # keying a dict by label would silently drop columns whenever
        # `columns` contains duplicates.
        result = DataFrame(dict(enumerate(sparrays)), index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert to dense DataFrame

        Returns
        -------
        df : DataFrame
        """
        from pandas import DataFrame

        parent = self._parent
        # Positional keys keep every column even when labels repeat.
        data = {pos: column.array.to_dense()
                for pos, (_, column) in enumerate(parent.items())}
        result = DataFrame(data, index=parent.index)
        result.columns = parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a scipy.sparse COO matrix.

        Returns
        -------
        scipy.sparse.coo_matrix
            Matrix with the same shape as the DataFrame. Values are cast
            to the common subtype of the columns.

        Raises
        ------
        ImportError
            If scipy is not installed.
        ValueError
            If any column has a fill value other than 0. A COO matrix
            treats missing entries as 0, so other fill values cannot be
            represented.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        parent = self._parent
        # Pass a plain list so dtypes are looked up by position, not by
        # the column labels that index ``parent.dtypes``.
        dtype = find_common_type(list(parent.dtypes))
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Iterate positionally so duplicate column labels still resolve
        # to a single column each.
        for col, (_, column) in enumerate(parent.items()):
            sp_arr = column.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting "
                                 "to COO matrix")
            row = sp_arr.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=parent.shape)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        return np.mean([column.array.density
                        for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check their lengths.

        Returns the ``(index, columns)`` pair, using default indexes for
        whichever is None, and raises ValueError if either length does
        not match ``data.shape``.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns
0 commit comments