@@ -883,27 +883,66 @@ def dot(self, other):
883
883
@classmethod
884
884
def from_dict (cls , data , orient = 'columns' , dtype = None , columns = None ):
885
885
"""
886
- Construct DataFrame from dict of array-like or dicts
886
+ Construct DataFrame from dict of array-like or dicts.
887
+
888
+ Creates DataFrame object from dictionary by columns or by index
889
+ allowing dtype specification.
887
890
888
891
Parameters
889
892
----------
890
893
data : dict
891
- {field : array-like} or {field : dict}
894
+ Of the form {field : array-like} or {field : dict}.
892
895
orient : {'columns', 'index'}, default 'columns'
893
896
The "orientation" of the data. If the keys of the passed dict
894
897
should be the columns of the resulting DataFrame, pass 'columns'
895
898
(default). Otherwise if the keys should be rows, pass 'index'.
896
899
dtype : dtype, default None
897
- Data type to force, otherwise infer
898
- columns: list, default None
899
- Column labels to use when orient='index'. Raises a ValueError
900
- if used with orient='columns'
900
+ Data type to force, otherwise infer.
901
+ columns : list, default None
902
+ Column labels to use when ``orient='index'``. Raises a ValueError
903
+ if used with ``orient='columns'``.
901
904
902
905
.. versionadded:: 0.23.0
903
906
904
907
Returns
905
908
-------
906
- DataFrame
909
+ pandas.DataFrame
910
+
911
+ See Also
912
+ --------
913
+ DataFrame.from_records : DataFrame from ndarray (structured
914
+ dtype), list of tuples, dict, or DataFrame
915
+ DataFrame : DataFrame object creation using constructor
916
+
917
+ Examples
918
+ --------
919
+ By default the keys of the dict become the DataFrame columns:
920
+
921
+ >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
922
+ >>> pd.DataFrame.from_dict(data)
923
+ col_1 col_2
924
+ 0 3 a
925
+ 1 2 b
926
+ 2 1 c
927
+ 3 0 d
928
+
929
+ Specify ``orient='index'`` to create the DataFrame using dictionary
930
+ keys as rows:
931
+
932
+ >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
933
+ >>> pd.DataFrame.from_dict(data, orient='index')
934
+ 0 1 2 3
935
+ row_1 3 2 1 0
936
+ row_2 a b c d
937
+
938
+ When using the 'index' orientation, the column names can be
939
+ specified manually:
940
+
941
+ >>> pd.DataFrame.from_dict(data, orient='index',
942
+ ... columns=['A', 'B', 'C', 'D'])
943
+ A B C D
944
+ row_1 3 2 1 0
945
+ row_2 a b c d
907
946
"""
908
947
index = None
909
948
orient = orient .lower ()
@@ -1209,20 +1248,68 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
1209
1248
1210
1249
def to_records (self , index = True , convert_datetime64 = True ):
1211
1250
"""
1212
- Convert DataFrame to record array. Index will be put in the
1213
- 'index' field of the record array if requested
1251
+ Convert DataFrame to a NumPy record array.
1252
+
1253
+ Index will be put in the 'index' field of the record array if
1254
+ requested.
1214
1255
1215
1256
Parameters
1216
1257
----------
1217
1258
index : boolean, default True
1218
- Include index in resulting record array, stored in 'index' field
1259
+ Include index in resulting record array, stored in 'index' field.
1219
1260
convert_datetime64 : boolean, default True
1220
1261
Whether to convert the index to datetime.datetime if it is a
1221
- DatetimeIndex
1262
+ DatetimeIndex.
1222
1263
1223
1264
Returns
1224
1265
-------
1225
- y : recarray
1266
+ y : numpy.recarray
1267
+
1268
+ See Also
1269
+ --------
1270
+ DataFrame.from_records: convert structured or record ndarray
1271
+ to DataFrame.
1272
+ numpy.recarray: ndarray that allows field access using
1273
+ attributes, analogous to typed columns in a
1274
+ spreadsheet.
1275
+
1276
+ Examples
1277
+ --------
1278
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
1279
+ ... index=['a', 'b'])
1280
+ >>> df
1281
+ A B
1282
+ a 1 0.50
1283
+ b 2 0.75
1284
+ >>> df.to_records()
1285
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
1286
+ dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
1287
+
1288
+ The index can be excluded from the record array:
1289
+
1290
+ >>> df.to_records(index=False)
1291
+ rec.array([(1, 0.5 ), (2, 0.75)],
1292
+ dtype=[('A', '<i8'), ('B', '<f8')])
1293
+
1294
+ By default, timestamps are converted to `datetime.datetime`:
1295
+
1296
+ >>> df.index = pd.date_range('2018-01-01 09:00', periods=2, freq='min')
1297
+ >>> df
1298
+ A B
1299
+ 2018-01-01 09:00:00 1 0.50
1300
+ 2018-01-01 09:01:00 2 0.75
1301
+ >>> df.to_records()
1302
+ rec.array([(datetime.datetime(2018, 1, 1, 9, 0), 1, 0.5 ),
1303
+ (datetime.datetime(2018, 1, 1, 9, 1), 2, 0.75)],
1304
+ dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
1305
+
1306
+ The timestamp conversion can be disabled so NumPy's datetime64
1307
+ data type is used instead:
1308
+
1309
+ >>> df.to_records(convert_datetime64=False)
1310
+ rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
1311
+ ('2018-01-01T09:01:00.000000000', 2, 0.75)],
1312
+ dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
1226
1313
"""
1227
1314
if index :
1228
1315
if is_datetime64_any_dtype (self .index ) and convert_datetime64 :
@@ -4722,20 +4809,90 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
4722
4809
4723
4810
def diff (self , periods = 1 , axis = 0 ):
4724
4811
"""
4725
- 1st discrete difference of object
4812
+ First discrete difference of element.
4813
+
4814
+ Calculates the difference of a DataFrame element compared with another
4815
+ element in the DataFrame (default is the element in the same column
4816
+ of the previous row).
4726
4817
4727
4818
Parameters
4728
4819
----------
4729
4820
periods : int, default 1
4730
- Periods to shift for forming difference
4821
+ Periods to shift for calculating difference, accepts negative
4822
+ values.
4731
4823
axis : {0 or 'index', 1 or 'columns'}, default 0
4732
4824
Take difference over rows (0) or columns (1).
4733
4825
4734
- .. versionadded:: 0.16.1
4826
+ .. versionadded:: 0.16.1
4735
4827
4736
4828
Returns
4737
4829
-------
4738
4830
diffed : DataFrame
4831
+
4832
+ See Also
4833
+ --------
4834
+ Series.diff: First discrete difference for a Series.
4835
+ DataFrame.pct_change: Percent change over given number of periods.
4836
+ DataFrame.shift: Shift index by desired number of periods with an
4837
+ optional time freq.
4838
+
4839
+ Examples
4840
+ --------
4841
+ Difference with previous row
4842
+
4843
+ >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
4844
+ ... 'b': [1, 1, 2, 3, 5, 8],
4845
+ ... 'c': [1, 4, 9, 16, 25, 36]})
4846
+ >>> df
4847
+ a b c
4848
+ 0 1 1 1
4849
+ 1 2 1 4
4850
+ 2 3 2 9
4851
+ 3 4 3 16
4852
+ 4 5 5 25
4853
+ 5 6 8 36
4854
+
4855
+ >>> df.diff()
4856
+ a b c
4857
+ 0 NaN NaN NaN
4858
+ 1 1.0 0.0 3.0
4859
+ 2 1.0 1.0 5.0
4860
+ 3 1.0 1.0 7.0
4861
+ 4 1.0 2.0 9.0
4862
+ 5 1.0 3.0 11.0
4863
+
4864
+ Difference with previous column
4865
+
4866
+ >>> df.diff(axis=1)
4867
+ a b c
4868
+ 0 NaN 0.0 0.0
4869
+ 1 NaN -1.0 3.0
4870
+ 2 NaN -1.0 7.0
4871
+ 3 NaN -1.0 13.0
4872
+ 4 NaN 0.0 20.0
4873
+ 5 NaN 2.0 28.0
4874
+
4875
+ Difference with 3rd previous row
4876
+
4877
+ >>> df.diff(periods=3)
4878
+ a b c
4879
+ 0 NaN NaN NaN
4880
+ 1 NaN NaN NaN
4881
+ 2 NaN NaN NaN
4882
+ 3 3.0 2.0 15.0
4883
+ 4 3.0 4.0 21.0
4884
+ 5 3.0 6.0 27.0
4885
+
4886
+ Difference with following row
4887
+
4888
+ >>> df.diff(periods=-1)
4889
+ a b c
4890
+ 0 -1.0 0.0 -3.0
4891
+ 1 -1.0 -1.0 -5.0
4892
+ 2 -1.0 -1.0 -7.0
4893
+ 3 -1.0 -2.0 -9.0
4894
+ 4 -1.0 -3.0 -11.0
4895
+ 5 NaN NaN NaN
4739
4896
"""
4740
4897
bm_axis = self ._get_block_manager_axis (axis )
4741
4898
new_data = self ._data .diff (n = periods , axis = bm_axis )
@@ -5501,7 +5658,22 @@ def corr(self, method='pearson', min_periods=1):
5501
5658
5502
5659
def cov (self , min_periods = None ):
5503
5660
"""
5504
- Compute pairwise covariance of columns, excluding NA/null values
5661
+ Compute pairwise covariance of columns, excluding NA/null values.
5662
+
5663
+ Compute the pairwise covariance among the series of a DataFrame.
5664
+ The returned data frame is the `covariance matrix
5665
+ <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
5666
+ of the DataFrame.
5667
+
5668
+ Both NA and null values are automatically excluded from the
5669
+ calculation. (See the note below about bias from missing values.)
5670
+ A threshold can be set for the minimum number of
5671
+ observations for each value created. Comparisons with observations
5672
+ below this threshold will be returned as ``NaN``.
5673
+
5674
+ This method is generally used for the analysis of time series data to
5675
+ understand the relationship between different measures
5676
+ across time.
5505
5677
5506
5678
Parameters
5507
5679
----------
@@ -5511,12 +5683,71 @@ def cov(self, min_periods=None):
5511
5683
5512
5684
Returns
5513
5685
-------
5514
- y : DataFrame
5686
+ DataFrame
5687
+ The covariance matrix of the series of the DataFrame.
5688
+
5689
+ See Also
5690
+ --------
5691
+ pandas.Series.cov : compute covariance with another Series
5692
+ pandas.core.window.EWM.cov : exponential weighted sample covariance
5693
+ pandas.core.window.Expanding.cov : expanding sample covariance
5694
+ pandas.core.window.Rolling.cov : rolling sample covariance
5515
5695
5516
5696
Notes
5517
5697
-----
5518
- `y` contains the covariance matrix of the DataFrame's time series.
5519
- The covariance is normalized by N-1 (unbiased estimator).
5698
+ Returns the covariance matrix of the DataFrame's time series.
5699
+ The covariance is normalized by N-1.
5700
+
5701
+ For DataFrames that have Series that are missing data (assuming that
5702
+ data is `missing at random
5703
+ <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
5704
+ the returned covariance matrix will be an unbiased estimate
5705
+ of the variance and covariance between the member Series.
5706
+
5707
+ However, for many applications this estimate may not be acceptable
5708
+ since the estimated covariance matrix is not guaranteed to be positive
5709
+ semi-definite. This could lead to estimated correlations having
5710
+ absolute values which are greater than one, and/or a non-invertible
5711
+ covariance matrix. See `Estimation of covariance matrices
5712
+ <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
5713
+ matrices>`__ for more details.
5714
+
5715
+ Examples
5716
+ --------
5717
+ >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
5718
+ ... columns=['dogs', 'cats'])
5719
+ >>> df.cov()
5720
+ dogs cats
5721
+ dogs 0.666667 -1.000000
5722
+ cats -1.000000 1.666667
5723
+
5724
+ >>> np.random.seed(42)
5725
+ >>> df = pd.DataFrame(np.random.randn(1000, 5),
5726
+ ... columns=['a', 'b', 'c', 'd', 'e'])
5727
+ >>> df.cov()
5728
+ a b c d e
5729
+ a 0.998438 -0.020161 0.059277 -0.008943 0.014144
5730
+ b -0.020161 1.059352 -0.008543 -0.024738 0.009826
5731
+ c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
5732
+ d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
5733
+ e 0.014144 0.009826 -0.000271 -0.013692 0.977795
5734
+
5735
+ **Minimum number of periods**
5736
+
5737
+ This method also supports an optional ``min_periods`` keyword
5738
+ that specifies the required minimum number of non-NA observations for
5739
+ each column pair in order to have a valid result:
5740
+
5741
+ >>> np.random.seed(42)
5742
+ >>> df = pd.DataFrame(np.random.randn(20, 3),
5743
+ ... columns=['a', 'b', 'c'])
5744
+ >>> df.loc[df.index[:5], 'a'] = np.nan
5745
+ >>> df.loc[df.index[5:10], 'b'] = np.nan
5746
+ >>> df.cov(min_periods=12)
5747
+ a b c
5748
+ a 0.316741 NaN -0.150812
5749
+ b NaN 1.248003 0.191417
5750
+ c -0.150812 0.191417 0.895202
5520
5751
"""
5521
5752
numeric_df = self ._get_numeric_data ()
5522
5753
cols = numeric_df .columns
0 commit comments