
Commit 86ca84d

parthea authored and jreback committed
TST: Fix gbq integration tests. gbq._Dataset.datasets() would not return full results
This PR resolves an issue where `gbq._Dataset.datasets()` would not return all datasets under a Google BigQuery project. If `'nextPageToken'` is populated in a response, another `datasets().list()` request should be sent with `'pageToken'` set in order to collect the remaining results. In the past few days, additional datasets were added under the Google BigQuery project id used by pandas as part of the following GitHub project: https://github.com/pydata/pandas-gbq . The added datasets caused many gbq unit tests to fail because `clean_gbq_environment()` checks whether a dataset exists using the incomplete results from `gbq._Dataset.datasets()` before attempting to delete it.

Author: Anthonios Partheniou <[email protected]>

Closes #15381 from parthea/fix-broken-gbq-unit-tests and squashes the following commits:

61bc1e7 [Anthonios Partheniou] TST: Fix gbq tests. gbq.dataset()/gbq.tables would not return full results.
1 parent d9e75c7 · commit 86ca84d

File tree

2 files changed: +52 −31 lines changed
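The fix follows BigQuery's standard list-pagination handshake described in the commit message: each `datasets().list()` or `tables().list()` response may carry a `'nextPageToken'`, which must be sent back as `pageToken` on the next request until no token is returned. As a minimal, self-contained sketch of that loop (the `list_all_datasets` helper and the `service` object are illustrative assumptions, not part of this commit):

```python
def list_all_datasets(service, project_id):
    """Collect every datasetId across paginated list() responses.

    Illustrative sketch only: ``service`` is assumed to be a
    google-api-python-client resource built for the BigQuery v2 API.
    """
    dataset_ids = []
    page_token = None
    while True:
        response = service.datasets().list(
            projectId=project_id, pageToken=page_token).execute()
        for item in response.get('datasets', []):
            dataset_ids.append(item['datasetReference']['datasetId'])
        page_token = response.get('nextPageToken')
        if not page_token:  # no token means the last page was reached
            return dataset_ids
```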

pandas/io/gbq.py

+44 −23
@@ -1056,21 +1056,32 @@ def datasets(self):
             List of datasets under the specific project
         """
 
-        try:
-            list_dataset_response = self.service.datasets().list(
-                projectId=self.project_id).execute().get('datasets', None)
+        dataset_list = []
+        next_page_token = None
+        first_query = True
 
-            if not list_dataset_response:
-                return []
+        while first_query or next_page_token:
+            first_query = False
 
-            dataset_list = list()
+            try:
+                list_dataset_response = self.service.datasets().list(
+                    projectId=self.project_id,
+                    pageToken=next_page_token).execute()
 
-            for row_num, raw_row in enumerate(list_dataset_response):
-                dataset_list.append(raw_row['datasetReference']['datasetId'])
+                dataset_response = list_dataset_response.get('datasets')
+                next_page_token = list_dataset_response.get('nextPageToken')
 
-            return dataset_list
-        except self.http_error as ex:
-            self.process_http_error(ex)
+                if not dataset_response:
+                    return dataset_list
+
+                for row_num, raw_row in enumerate(dataset_response):
+                    dataset_list.append(
+                        raw_row['datasetReference']['datasetId'])
+
+            except self.http_error as ex:
+                self.process_http_error(ex)
+
+        return dataset_list
 
     def create(self, dataset_id):
         """ Create a dataset in Google BigQuery
@@ -1140,19 +1151,29 @@ def tables(self, dataset_id):
             List of tables under the specific dataset
         """
 
-        try:
-            list_table_response = self.service.tables().list(
-                projectId=self.project_id,
-                datasetId=dataset_id).execute().get('tables', None)
+        table_list = []
+        next_page_token = None
+        first_query = True
 
-            if not list_table_response:
-                return []
+        while first_query or next_page_token:
+            first_query = False
 
-            table_list = list()
+            try:
+                list_table_response = self.service.tables().list(
+                    projectId=self.project_id,
+                    datasetId=dataset_id,
+                    pageToken=next_page_token).execute()
 
-            for row_num, raw_row in enumerate(list_table_response):
-                table_list.append(raw_row['tableReference']['tableId'])
+                table_response = list_table_response.get('tables')
+                next_page_token = list_table_response.get('nextPageToken')
 
-            return table_list
-        except self.http_error as ex:
-            self.process_http_error(ex)
+                if not table_response:
+                    return table_list
+
+                for row_num, raw_row in enumerate(table_response):
+                    table_list.append(raw_row['tableReference']['tableId'])
+
+            except self.http_error as ex:
+                self.process_http_error(ex)
+
+        return table_list
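For comparison, google-api-python-client also ships a `list_next()` helper that wraps the same token handshake. A sketch of an equivalent loop for tables (assuming the built BigQuery resource exposes `list_next()`; `list_all_tables` is an illustrative name, not part of this commit):

```python
def list_all_tables(service, project_id, dataset_id):
    """Variant using the discovery client's list_next() helper.

    Sketch under the assumption that the BigQuery discovery resource
    supports list_next(); token handling is otherwise identical to the
    explicit pageToken loop in the diff above.
    """
    table_ids = []
    request = service.tables().list(projectId=project_id,
                                    datasetId=dataset_id)
    while request is not None:
        response = request.execute()
        for item in response.get('tables', []):
            table_ids.append(item['tableReference']['tableId'])
        # list_next() returns None once the response has no nextPageToken.
        request = service.tables().list_next(request, response)
    return table_ids
```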

pandas/tests/io/test_gbq.py

+8 −8
@@ -253,7 +253,7 @@ def test_generate_bq_schema_deprecated():
         gbq.generate_bq_schema(df)
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase):
 
     def setUp(self):
@@ -299,7 +299,7 @@ def test_get_application_default_credentials_returns_credentials(self):
         self.assertTrue(isinstance(credentials, GoogleCredentials))
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase):
     def setUp(self):
         _setup_common()
@@ -331,7 +331,7 @@ def test_should_be_able_to_get_results_from_query(self):
         self.assertTrue(pages is not None)
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase):
     def setUp(self):
         _setup_common()
@@ -449,7 +449,7 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self):
             private_key=re.sub('[a-z]', '9', _get_private_key_contents()))
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestReadGBQIntegration(tm.TestCase):
 
     @classmethod
@@ -503,7 +503,7 @@ def test_should_read_as_service_account_with_key_contents(self):
         tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']}))
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase):
 
     @classmethod
@@ -906,7 +906,7 @@ def test_configuration_without_query(self):
             configuration=config)
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.
@@ -1219,7 +1219,7 @@ def test_dataset_does_not_exist(self):
             DATASET_ID + "_not_found"), 'Expected dataset not to exist')
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
     # As a workaround to this issue, each test should use a unique table name.
@@ -1277,7 +1277,7 @@ def test_upload_data(self):
         self.assertEqual(result['num_rows'][0], test_size)
 
 
-@pytest.mark.xfail(run=False, reason="flaky tests")
+@pytest.mark.single
 class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015
    # As a workaround to this issue, each test should use a unique table name.
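The `@pytest.mark.single` marker lets these network-bound integration classes be selected or excluded as a group instead of being permanently xfail'd. How the marker is registered is project-specific; as a generic illustration (the description string below is an assumption, not pandas' actual configuration), a `conftest.py` could declare it like this:

```python
# conftest.py -- illustrative registration of a custom pytest marker.
def pytest_configure(config):
    config.addinivalue_line(
        'markers',
        'single: tests that should run serially, e.g. live gbq integration')
```

With the marker registered, `pytest pandas/tests/io/test_gbq.py -m single` selects only these classes, while `-m "not single"` skips them.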
