Adjust to Pandas breaking change, bump Python version (#101)

vinceatbluelabs · web-flow · commit d368dc61ed82 · 2020-08-12T08:41:39.000-04:00
Pandas 1.1 has broken Records Mover's usage of the read_csv() function by adding error checking in cases where a certain argument (here, `prefix`) would be unused. Details of the Pandas change: * pandas-dev/pandas#27394 * pandas-dev/pandas#31383. See the Records Mover test failures here: * https://app.circleci.com/pipelines/github/bluelabsio/records-mover/1089/workflows/e62f1cf0-f8d0-4e22-9652-112df72b02b8/jobs/9439
diff --git a/.python-version b/.python-version
@@ -1 +1 @@
-records-mover-3.8.3
+records-mover-3.8.5
diff --git a/deps.sh b/deps.sh
@@ -7,7 +7,7 @@ export SLUGIFY_USES_TEXT_UNIDECODE=yes
 brew update && ( brew upgrade pyenv || true )
 pyenv rehash  # needed if pyenv is updated
 
-python_version=3.8.3
+python_version=3.8.5
 # zipimport.ZipImportError: can't decompress data; zlib not available:
 #    You may need `xcode-select --install` on OS X
 #    https://github.com/pyenv/pyenv/issues/451#issuecomment-151336786
diff --git a/metrics/coverage_high_water_mark b/metrics/coverage_high_water_mark
@@ -1 +1 @@
-93.7100
+93.7200
diff --git a/metrics/flake8_high_water_mark b/metrics/flake8_high_water_mark
@@ -1 +1 @@
-170
+169
diff --git a/records_mover/records/delimited/csv_streamer.py b/records_mover/records/delimited/csv_streamer.py
@@ -41,10 +41,21 @@ def stream_csv(filepath_or_buffer: Union[str, IO[bytes]],
         'header': header,
         'compression': pandas_compression_from_hint[compression_hint],
         'escapechar': hints.get('escape'),
-        'prefix': 'untitled_',
         'iterator': True,
         'engine': 'python'
     }
+    if header is None:
+        # Pandas only accepts the prefix argument (which makes for
+        # tidier column names when otherwise not provided) when the
+        # header is explicitly marked as missing, not when it's
+        # available or even when we ask Pandas to infer it.  Bummer,
+        # as this means that when Pandas infers that there's no
+        # header, the column names will end up different than when
+        # folks explicitly tell records mover that there is no header.
+        #
+        # https://github.com/pandas-dev/pandas/issues/27394
+        # https://github.com/pandas-dev/pandas/pull/31383
+        kwargs['prefix'] = 'untitled_'
     if 'quoting' in hints:
         quoting = hints['quoting']
         kwargs['quoting'] = pandas_quoting_from_hint[quoting]
diff --git a/records_mover/records/mover.py b/records_mover/records/mover.py
@@ -17,7 +17,7 @@
 
 def move(records_source: RecordsSource,
          records_target: RecordsTarget,
-         processing_instructions: ProcessingInstructions=ProcessingInstructions()) -> MoveResult:
+         processing_instructions: ProcessingInstructions = ProcessingInstructions()) -> MoveResult:
     """Copy records from one location to another.  Applies a sequence of
     possible techniques to do this in an efficient way and respects
     the preferences set in records_source, records_target and
diff --git a/records_mover/records/pandas/read_csv_options.py b/records_mover/records/pandas/read_csv_options.py
@@ -161,7 +161,13 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat,
     # Leaving this in case a future version of Pandas behaves
     # better.
     #
-    pandas_options['prefix'] = 'untitled_'
+    if pandas_options['header'] is None:
+        # Pandas only accepts the prefix argument when the
+        # header is marked as missing.
+        #
+        # https://github.com/pandas-dev/pandas/issues/27394
+        # https://github.com/pandas-dev/pandas/pull/31383
+        pandas_options['prefix'] = 'untitled_'
 
     #
     # mangle_dupe_cols : bool, default True
diff --git a/tests/unit/records/test_csv_streamer.py b/tests/unit/records/test_csv_streamer.py
@@ -24,7 +24,6 @@ def test_stream_csv_no_compression(self,
                                              engine='python',
                                              escapechar=None,
                                              header='infer',
-                                             prefix='untitled_',
                                              quoting=0,
                                              iterator=True,
                                              sep=mock_field_delimiter)
@@ -49,7 +48,6 @@ def test_stream_csv_gzip(self,
                                              engine='python',
                                              escapechar=None,
                                              header='infer',
-                                             prefix='untitled_',
                                              iterator=True,
                                              sep=mock_field_delimiter)
             mock_io.TextIOWrapper.assert_not_called()
@@ -72,7 +70,6 @@ def test_stream_filename(self,
                                              engine='python',
                                              escapechar=None,
                                              header='infer',
-                                             prefix='untitled_',
                                              iterator=True,
                                              sep=mock_field_delimiter)
             mock_io.TextIOWrapper.assert_not_called()
diff --git a/tests/unit/records/test_pandas_read_csv_options.py b/tests/unit/records/test_pandas_read_csv_options.py
@@ -118,7 +118,6 @@ def test_pandas_read_csv_options_csv(self):
             'engine': 'python',
             'error_bad_lines': True,
             'header': 0,
-            'prefix': 'untitled_',
             'quotechar': '"',
             'quoting': 0,
             'warn_bad_lines': True,

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-records-mover-3.8.3`
	`1`	`+records-mover-3.8.5`
Original file line number	Diff line number	Diff line change
`@@ -161,7 +161,13 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat,`
`161`	`161`	`# Leaving this in case a future version of Pandas behaves`
`162`	`162`	`# better.`
`163`	`163`	`#`
`164`		`- pandas_options['prefix'] = 'untitled_'`
	`164`	`+ if pandas_options['header'] is None:`
	`165`	`+ # Pandas only accepts the prefix argument when the`
	`166`	`+ # header is marked as missing.`
	`167`	`+ #`
	`168`	`+ # https://github.com/pandas-dev/pandas/issues/27394`
	`169`	`+ # https://github.com/pandas-dev/pandas/pull/31383`
	`170`	`+ pandas_options['prefix'] = 'untitled_'`
`165`	`171`
`166`	`172`	`#`
`167`	`173`	`# mangle_dupe_cols : bool, default True`