DOC: read_excel doc - fixed formatting and added examples (pandas-dev#18753)

JanLauGe · hexgnu · commit 21b154cdb368 · 2018-01-01T08:46:11.000-08:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -205,6 +205,7 @@ Other API Changes
 - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`)
 - :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`)
 - :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`)
+- In :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`)
 
 .. _whatsnew_0230.deprecations:
 
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -137,7 +137,7 @@
 na_values : scalar, str, list-like, or dict, default None
     Additional strings to recognize as NA/NaN. If dict passed, specific
     per-column NA values. By default the following values are interpreted
-    as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'.
+    as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent="    ") + """'.
 keep_default_na : bool, default True
     If na_values are specified and keep_default_na is False the default NaN
     values are overridden, otherwise they're appended to.
@@ -148,6 +148,10 @@
     this parameter is only necessary for columns stored as TEXT in Excel,
     any numeric columns will automatically be parsed, regardless of display
     format.
+comment : str, default None
+    Comments out remainder of line. Pass a character or characters to this
+    argument to indicate comments in the input file. Any data between the
+    comment string and the end of the current line is ignored.
 skip_footer : int, default 0
 
     .. deprecated:: 0.23.0
@@ -164,6 +168,77 @@
 parsed : DataFrame or Dict of DataFrames
     DataFrame from the passed in Excel file.  See notes in sheet_name
     argument for more information on when a Dict of Dataframes is returned.
+
+Examples
+--------
+
+Write an example DataFrame to a local Excel file:
+
+>>> df_out = pd.DataFrame([('string1', 1),
+...                        ('string2', 2),
+...                        ('string3', 3)],
+...                       columns=['Name', 'Value'])
+>>> df_out
+      Name  Value
+0  string1      1
+1  string2      2
+2  string3      3
+>>> df_out.to_excel('tmp.xlsx')
+
+The file can be read using the file name as string or an open file object:
+
+>>> pd.read_excel('tmp.xlsx')
+      Name  Value
+0  string1      1
+1  string2      2
+2  string3      3
+
+>>> pd.read_excel(open('tmp.xlsx', 'rb'))
+      Name  Value
+0  string1      1
+1  string2      2
+2  string3      3
+
+Index and header can be specified via the `index_col` and `header` arguments:
+
+>>> pd.read_excel('tmp.xlsx', index_col=None, header=None)
+     0        1      2
+0  NaN     Name  Value
+1  0.0  string1      1
+2  1.0  string2      2
+3  2.0  string3      3
+
+Column types are inferred but can be explicitly specified:
+
+>>> pd.read_excel('tmp.xlsx', dtype={'Name': str, 'Value': float})
+      Name  Value
+0  string1    1.0
+1  string2    2.0
+2  string3    3.0
+
+True, False, and NA values, as well as the thousands separator, have defaults,
+but can also be specified explicitly. Supply the values you would like
+as strings or lists of strings:
+
+>>> pd.read_excel('tmp.xlsx',
+...               na_values=['string1', 'string2'])
+      Name  Value
+0      NaN      1
+1      NaN      2
+2  string3      3
+
+Comment lines in the Excel input file can be skipped using the `comment` kwarg:
+
+>>> df = pd.DataFrame({'a': ['1', '#2'], 'b': ['2', '3']})
+>>> df.to_excel('tmp.xlsx', index=False)
+>>> pd.read_excel('tmp.xlsx')
+    a  b
+0   1  2
+1  #2  3
+
+>>> pd.read_excel('tmp.xlsx', comment='#')
+   a  b
+0  1  2
 """
 
 
@@ -223,6 +298,7 @@ def read_excel(io,
                parse_dates=False,
                date_parser=None,
                thousands=None,
+               comment=None,
                skipfooter=0,
                convert_float=True,
                **kwds):
@@ -256,6 +332,7 @@ def read_excel(io,
         parse_dates=parse_dates,
         date_parser=date_parser,
         thousands=thousands,
+        comment=comment,
         skipfooter=skipfooter,
         convert_float=convert_float,
         **kwds)
@@ -338,6 +415,7 @@ def parse(self,
               parse_dates=False,
               date_parser=None,
               thousands=None,
+              comment=None,
               skipfooter=0,
               convert_float=True,
               **kwds):
@@ -363,6 +441,7 @@ def parse(self,
                                  parse_dates=parse_dates,
                                  date_parser=date_parser,
                                  thousands=thousands,
+                                 comment=comment,
                                  skipfooter=skipfooter,
                                  convert_float=convert_float,
                                  **kwds)
@@ -417,6 +496,7 @@ def _parse_excel(self,
                      parse_dates=False,
                      date_parser=None,
                      thousands=None,
+                     comment=None,
                      skipfooter=0,
                      convert_float=True,
                      **kwds):
@@ -591,6 +671,7 @@ def _parse_cell(cell_contents, cell_typ):
                                     parse_dates=parse_dates,
                                     date_parser=date_parser,
                                     thousands=thousands,
+                                    comment=comment,
                                     skipfooter=skipfooter,
                                     **kwds)
 
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -1858,6 +1858,68 @@ def test_invalid_columns(self):
             with pytest.raises(KeyError):
                 write_frame.to_excel(path, 'test1', columns=['C', 'D'])
 
+    def test_comment_arg(self):
+        # See issue #18735
+        # comment='#' must match a plain read with the commented cells blanked
+        with ensure_clean(self.ext) as path:
+
+            # Write a frame in which some cells start with '#'
+            df = DataFrame({'A': ['one', '#one', 'one'],
+                            'B': ['two', 'two', '#two']})
+            df.to_excel(path, 'test_c')
+
+            # Plain read, then blank the cells comment='#' is expected to drop
+            result1 = read_excel(path, 'test_c')
+            result1.iloc[1, 0] = None
+            result1.iloc[1, 1] = None
+            result1.iloc[2, 1] = None
+            result2 = read_excel(path, 'test_c', comment='#')
+            tm.assert_frame_equal(result1, result2)
+
+    def test_comment_default(self):
+        # See issue #18735
+        # Omitting comment must behave the same as passing comment=None
+        with ensure_clean(self.ext) as path:
+
+            # Write a frame in which some cells start with '#'
+            df = DataFrame({'A': ['one', '#one', 'one'],
+                            'B': ['two', 'two', '#two']})
+            df.to_excel(path, 'test_c')
+
+            # Read with the default, then with an explicit comment=None
+            result1 = read_excel(path, 'test_c')
+            result2 = read_excel(path, 'test_c', comment=None)
+            tm.assert_frame_equal(result1, result2)
+
+    def test_comment_used(self):
+        # See issue #18735
+        # comment='#' must produce the expected frame when actually used
+        with ensure_clean(self.ext) as path:
+
+            # Write a frame in which some cells start with '#'
+            df = DataFrame({'A': ['one', '#one', 'one'],
+                            'B': ['two', 'two', '#two']})
+            df.to_excel(path, 'test_c')
+
+            # Compare the comment='#' read against a hand-built expected frame
+            expected = DataFrame({'A': ['one', None, 'one'],
+                                  'B': ['two', None, None]})
+            result = read_excel(path, 'test_c', comment='#')
+            tm.assert_frame_equal(result, expected)
+
+    def test_comment_emptyline(self):
+        # See issue #18735
+        # A row left empty after comment stripping must be dropped
+        with ensure_clean(self.ext) as path:
+
+            df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']})
+            df.to_excel(path, index=False)
+
+            # Last row starts with '#', so only the first row should remain
+            expected = DataFrame({'a': [1], 'b': [2]})
+            result = read_excel(path, comment='#')
+            tm.assert_frame_equal(result, expected)
+
     def test_datetimes(self):
 
         # Test writing and reading datetimes. For issue #9139. (xref #9185)