feature: Estimator.fit like logs for transformer

imujjwal96 · imujjwal96 · commit 25bf803a5b70 · 2019-05-07T23:08:27.000+05:30
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
@@ -1225,6 +1225,116 @@ def logs_for_job(self, job_name, wait=False, poll=10):  # noqa: C901 - suppress
             billable_time = (description['TrainingEndTime'] - description['TrainingStartTime']) * instance_count
             print('Billable seconds:', int(billable_time.total_seconds()) + 1)
 
+    def logs_for_transform_job(self, job_name, wait=False, poll=10):
+        """Display the logs for a given transform job, optionally tailing them until the
+        job is complete. If the output is a tty or a Jupyter cell, it will be color-coded
+        based on which instance the log entry is from.
+        Args:
+            job_name (str): Name of the transform job to display the logs for.
+            wait (bool): Whether to keep looking for new log entries until the job completes (default: False).
+            poll (int): The interval in seconds between polling for new log entries and job completion (default: 10).
+        Raises:
+            ValueError: If waiting and the transform job fails.
+        """
+
+        description = self.sagemaker_client.describe_transform_job(TransformJobName=job_name)
+        instance_count = description['TransformResources']['InstanceCount']
+        status = description['TransformJobStatus']
+
+        stream_names = []  # The list of log streams
+        positions = {}     # The current position in each stream, map of stream name -> position
+
+        # Increase retries allowed (from default of 4), as we don't want waiting for a transform job
+        # to be interrupted by a transient exception.
+        config = botocore.config.Config(retries={'max_attempts': 15})
+        client = self.boto_session.client('logs', config=config)
+        log_group = '/aws/sagemaker/TransformJobs'
+
+        job_already_completed = True if status == 'Completed' or status == 'Failed' or status == 'Stopped' else False
+
+        state = LogState.TAILING if wait and not job_already_completed else LogState.COMPLETE
+        dot = False
+
+        color_wrap = sagemaker.logs.ColorWrap()
+
+        # The loop below implements a state machine that alternates between checking the job status and
+        # reading whatever is available in the logs at this point. Note that if we were called with
+        # wait == False, we never check the job status.
+        #
+        # If wait == TRUE and job is not completed, the initial state is TAILING
+        # If wait == FALSE, the initial state is COMPLETE (doesn't matter if the job really is complete).
+        #
+        # The state table:
+        #
+        # STATE               ACTIONS                        CONDITION             NEW STATE
+        # ----------------    ----------------               -----------------     ----------------
+        # TAILING             Read logs, Pause, Get status   Job complete          JOB_COMPLETE
+        #                                                    Else                  TAILING
+        # JOB_COMPLETE        Read logs, Pause               Any                   COMPLETE
+        # COMPLETE            Read logs, Exit                                      N/A
+        #
+        # Notes:
+        # - The JOB_COMPLETE state forces us to do an extra pause and read any items that got to Cloudwatch after
+        #   the job was marked complete.
+        last_describe_job_call = time.time()
+        last_description = description
+        while True:
+            if len(stream_names) < instance_count:
+                # Log streams are created whenever a container starts writing to stdout/err, so this list
+                # may be dynamic until we have a stream for every instance.
+                try:
+                    streams = client.describe_log_streams(logGroupName=log_group, logStreamNamePrefix=job_name + '/',
+                                                          orderBy='LogStreamName', limit=instance_count)
+                    stream_names = [s['logStreamName'] for s in streams['logStreams']]
+                    positions.update([(s, sagemaker.logs.Position(timestamp=0, skip=0))
+                                      for s in stream_names if s not in positions])
+                except ClientError as e:
+                    # On the very first transform job run on an account, there's no log group until
+                    # the container starts logging, so ignore any errors thrown about that
+                    err = e.response.get('Error', {})
+                    if err.get('Code', None) != 'ResourceNotFoundException':
+                        raise
+
+            if len(stream_names) > 0:
+                if dot:
+                    print('')
+                    dot = False
+                for idx, event in sagemaker.logs.multi_stream_iter(client, log_group, stream_names, positions):
+                    color_wrap(idx, event['message'])
+                    ts, count = positions[stream_names[idx]]
+                    if event['timestamp'] == ts:
+                        positions[stream_names[idx]] = sagemaker.logs.Position(timestamp=ts, skip=count + 1)
+                    else:
+                        positions[stream_names[idx]] = sagemaker.logs.Position(timestamp=event['timestamp'], skip=1)
+            else:
+                dot = True
+                print('.', end='')
+                sys.stdout.flush()
+            if state == LogState.COMPLETE:
+                break
+
+            time.sleep(poll)
+
+            if state == LogState.JOB_COMPLETE:
+                state = LogState.COMPLETE
+            elif time.time() - last_describe_job_call >= 30:
+                description = self.sagemaker_client.describe_transform_job(TransformJobName=job_name)
+                last_describe_job_call = time.time()
+
+                status = description['TransformJobStatus']
+
+                if status == 'Completed' or status == 'Failed' or status == 'Stopped':
+                    print()
+                    state = LogState.JOB_COMPLETE
+
+        if wait:
+            self._check_job_status(job_name, description, 'TransformJobStatus')
+            if dot:
+                print()
+            # Customers are not billed for hardware provisioning, so billable time is less than total time
+            billable_time = (description['TransformEndTime'] - description['TransformStartTime']) * instance_count
+            print('Billable seconds:', int(billable_time.total_seconds()) + 1)
+
 
 def container_def(image, model_data_url=None, env=None):
     """Create a definition for executing a container as part of a SageMaker model.
diff --git a/src/sagemaker/transformer.py b/src/sagemaker/transformer.py
@@ -79,7 +79,7 @@ def __init__(self, model_name, instance_count, instance_type, strategy=None, ass
         self.sagemaker_session = sagemaker_session or Session()
 
     def transform(self, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None,
-                  job_name=None):
+                  wait=True, logs=True, job_name=None):
         """Start a new transform job.
 
         Args:
@@ -96,6 +96,9 @@ def transform(self, data, data_type='S3Prefix', content_type=None, compression_t
                 Valid values: 'Gzip', None.
             split_type (str): The record delimiter for the input object (default: 'None').
                 Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
+            wait (bool): Whether the call should wait until the job completes (default: True).
+            logs (bool): Whether to show the logs produced by the job.
+                Only meaningful when wait is True (default: True).
             job_name (str): job name (default: None). If not specified, one will be generated.
         """
         local_mode = self.sagemaker_session.local_mode
@@ -113,6 +116,8 @@ def transform(self, data, data_type='S3Prefix', content_type=None, compression_t
 
         self.latest_transform_job = _TransformJob.start_new(self, data, data_type, content_type, compression_type,
                                                             split_type)
+        if wait:
+            self.latest_transform_job.wait(logs=logs)
 
     def delete_model(self):
         """Delete the corresponding SageMaker model for this Transformer.
@@ -130,9 +135,9 @@ def _retrieve_image_name(self):
                              'Local instance types require locally created models.'
                              % self.model_name)
 
-    def wait(self):
+    def wait(self, logs=True):
         self._ensure_last_transform_job()
-        self.latest_transform_job.wait()
+        self.latest_transform_job.wait(logs=logs)
 
     def _ensure_last_transform_job(self):
         if self.latest_transform_job is None:
@@ -205,8 +210,11 @@ def start_new(cls, transformer, data, data_type, content_type, compression_type,
 
         return cls(transformer.sagemaker_session, transformer._current_job_name)
 
-    def wait(self):
-        self.sagemaker_session.wait_for_transform_job(self.job_name)
+    def wait(self, logs=True):
+        if logs:
+            self.sagemaker_session.logs_for_transform_job(self.job_name, wait=True)
+        else:
+            self.sagemaker_session.wait_for_transform_job(self.job_name)
 
     @staticmethod
     def _load_config(data, data_type, content_type, compression_type, split_type, transformer):