BUG: fix read_csv to parse timezone correctly (#22380)

swyoon · jreback · commit 45c0b5ffd4f7 · 2018-08-22T06:27:16.000-04:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -676,6 +676,7 @@ I/O
 
 - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
+- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
 -
 
 Plotting
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1620,7 +1620,6 @@ def _infer_types(self, values, na_values, try_num_bool=True):
         converted : ndarray
         na_count : int
         """
-
         na_count = 0
         if issubclass(values.dtype.type, (np.number, np.bool_)):
             mask = algorithms.isin(values, list(na_values))
@@ -1633,20 +1632,22 @@ def _infer_types(self, values, na_values, try_num_bool=True):
 
         if try_num_bool:
             try:
-                result = lib.maybe_convert_numeric(values, na_values, False)
+                result = lib.maybe_convert_numeric(np.asarray(values),
+                                                   na_values, False)
                 na_count = isna(result).sum()
             except Exception:
                 result = values
                 if values.dtype == np.object_:
-                    na_count = parsers.sanitize_objects(result, na_values,
-                                                        False)
+                    na_count = parsers.sanitize_objects(np.asarray(result),
+                                                        na_values, False)
         else:
             result = values
             if values.dtype == np.object_:
-                na_count = parsers.sanitize_objects(values, na_values, False)
+                na_count = parsers.sanitize_objects(np.asarray(values),
+                                                    na_values, False)
 
         if result.dtype == np.object_ and try_num_bool:
-            result = libops.maybe_convert_bool(values,
+            result = libops.maybe_convert_bool(np.asarray(values),
                                                true_values=self.true_values,
                                                false_values=self.false_values)
 
@@ -3033,7 +3034,7 @@ def converter(*date_cols):
                 return tools.to_datetime(
                     ensure_object(strs),
                     utc=None,
-                    box=False,
+                    box=True,
                     dayfirst=dayfirst,
                     errors='ignore',
                     infer_datetime_format=infer_datetime_format
diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py
@@ -13,9 +13,9 @@
 from pandas._libs.tslibs import parsing
 from pandas._libs.tslib import Timestamp
 
+import pytz
 import pandas as pd
 import pandas.io.parsers as parsers
-import pandas.core.tools.datetimes as tools
 import pandas.util.testing as tm
 
 import pandas.io.date_converters as conv
@@ -356,21 +356,13 @@ def test_parse_dates_custom_euroformat(self):
 
     def test_parse_tz_aware(self):
         # See gh-1693
-        import pytz
         data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5")
 
         # it works
         result = self.read_csv(data, index_col=0, parse_dates=True)
         stamp = result.index[0]
         assert stamp.minute == 39
-        try:
-            assert result.index.tz is pytz.utc
-        except AssertionError:
-            arr = result.index.to_pydatetime()
-            result = tools.to_datetime(arr, utc=True)[0]
-            assert stamp.minute == result.minute
-            assert stamp.hour == result.hour
-            assert stamp.day == result.day
+        assert result.index.tz is pytz.utc
 
     def test_multiple_date_cols_index(self):
         data = """
@@ -674,3 +666,19 @@ def test_parse_date_float(self, data, expected, parse_dates):
         # (i.e. float precision should remain unchanged).
         result = self.read_csv(StringIO(data), parse_dates=parse_dates)
         tm.assert_frame_equal(result, expected)
+
+    def test_parse_timezone(self):
+        # gh-22256
+        data = """dt,val
+                  2018-01-04 09:01:00+09:00,23350
+                  2018-01-04 09:02:00+09:00,23400
+                  2018-01-04 09:03:00+09:00,23400
+                  2018-01-04 09:04:00+09:00,23400
+                  2018-01-04 09:05:00+09:00,23400"""
+        parsed = self.read_csv(StringIO(data), parse_dates=['dt'])
+        dti = pd.DatetimeIndex(start='2018-01-04 09:01:00',
+                               end='2018-01-04 09:05:00', freq='1min',
+                               tz=pytz.FixedOffset(540))
+        expected_data = {'dt': dti, 'val': [23350, 23400, 23400, 23400, 23400]}
+        expected = DataFrame(expected_data)
+        tm.assert_frame_equal(parsed, expected)

Original file line number	Diff line number	Diff line change
`@@ -676,6 +676,7 @@ I/O`
`676`	`676`
`677`	`677`	- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
`678`	`678`	- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
	`679`	+- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
`679`	`680`	`-`
`680`	`681`
`681`	`682`	`Plotting`