Use tf.contrib.layers.dense_to_sparse instead of util/ctc.py

reuben · reuben · commit f3613da82ae7 · 2019-02-04T09:19:48.000-02:00
diff --git a/evaluate.py b/evaluate.py
@@ -19,7 +19,6 @@
 from six.moves import zip, range
 from util.audio import audiofile_to_input_vector
 from util.config import Config, initialize_globals
-from util.ctc import ctc_label_dense_to_sparse
 from util.flags import create_flags, FLAGS
 from util.logging import log_error
 from util.preprocess import pmap, preprocess
@@ -111,7 +110,14 @@ def create_windows(features):
         labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
         label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")
 
-        sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
+        # The fed transcripts have 1 added to every element so no label is zero,
+        # because dense_to_sparse treats zero as the end-of-sequence token when
+        # building the SparseTensor. So here we convert the placeholder back into
+        # a SparseTensor and subtract one from each value to get the real labels.
+        sparse_labels = tf.contrib.layers.dense_to_sparse(labels_ph)
+        neg_ones = tf.SparseTensor(sparse_labels.indices, -1 * tf.ones_like(sparse_labels.values), sparse_labels.dense_shape)
+        sparse_labels = tf.sparse_add(sparse_labels, neg_ones)
+
         loss = tf.nn.ctc_loss(labels=sparse_labels,
                               inputs=layers['raw_logits'],
                               sequence_length=inputs['input_lengths'])
@@ -143,7 +149,7 @@ def create_windows(features):
 
             features = pad_to_dense(batch['features'].values)
             features_len = batch['features_len'].values
-            labels = pad_to_dense(batch['transcript'].values)
+            labels = pad_to_dense(batch['transcript'].values + 1)
             label_lengths = batch['transcript_len'].values
 
             logits, loss_ = session.run([transposed, loss], feed_dict={
diff --git a/util/ctc.py b/util/ctc.py
diff --git a/util/feeding.py b/util/feeding.py
@@ -4,7 +4,6 @@
 from math import ceil
 from six.moves import range
 from threading import Thread
-from util.ctc import ctc_label_dense_to_sparse
 from util.gpu import get_available_gpus
 
 
@@ -143,11 +142,14 @@ def _populate_batch_queue(self, session, coord):
                 (features.strides[0], features.strides[0], features.strides[1]),
                 writeable=False)
 
+            # We add 1 to all elements of the transcript here so that no label is
+            # zero, because zero is used as the end-of-sequence token when the
+            # batch is later converted into a SparseTensor.
             try:
                 session.run(self._enqueue_op, feed_dict={
                     self._model_feeder.ph_x: features,
                     self._model_feeder.ph_x_length: num_strides,
-                    self._model_feeder.ph_y: transcript,
+                    self._model_feeder.ph_y: transcript + 1,
                     self._model_feeder.ph_y_length: transcript_len
                 })
             except tf.errors.CancelledError:
@@ -173,8 +175,10 @@ def next_batch(self):
         Draw the next batch from the combined switchable queue.
         '''
         source, source_lengths, target, target_lengths = self._queue.dequeue_many(self._model_feeder.ph_batch_size)
-        sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._model_feeder.ph_batch_size)
-        return source, source_lengths, sparse_labels
+        # Back to sparse, then subtract one to get the real labels
+        sparse_labels = tf.contrib.layers.dense_to_sparse(target)
+        neg_ones = tf.SparseTensor(sparse_labels.indices, -1 * tf.ones_like(sparse_labels.values), sparse_labels.dense_shape)
+        return source, source_lengths, tf.sparse_add(sparse_labels, neg_ones)
 
     def start_queue_threads(self, session, coord):
         '''