
Commit 4356704

Merge pull request #1862 from mozilla/split_tf_deps
Clean up and split TensorFlow deps out of util/text.py
2 parents 1741467 + 12c6275 commit 4356704

File tree

3 files changed: +47 -124 lines

  evaluate.py       +18 -16
  util/feeding.py    +8 -4
  util/text.py      +21 -104

evaluate.py (+18 -16)

@@ -22,7 +22,7 @@
 from util.flags import create_flags, FLAGS
 from util.logging import log_error
 from util.preprocess import pmap, preprocess
-from util.text import Alphabet, ctc_label_dense_to_sparse, wer, levenshtein
+from util.text import Alphabet, wer_cer_batch, levenshtein


 def split_data(dataset, batch_size):
@@ -47,15 +47,14 @@ def pad_to_dense(jagged):

 def process_decode_result(item):
     label, decoding, distance, loss = item
-    sample_wer = wer(label, decoding)
+    word_distance = levenshtein(label.split(), decoding.split())
+    word_length = float(len(label.split()))
     return AttrDict({
         'src': label,
         'res': decoding,
         'loss': loss,
         'distance': distance,
-        'wer': sample_wer,
-        'levenshtein': levenshtein(label.split(), decoding.split()),
-        'label_length': float(len(label.split())),
+        'wer': word_distance / word_length,
     })


@@ -67,19 +66,16 @@ def calculate_report(labels, decodings, distances, losses):
     '''
     samples = pmap(process_decode_result, zip(labels, decodings, distances, losses))

-    total_levenshtein = sum(s.levenshtein for s in samples)
-    total_label_length = sum(s.label_length for s in samples)
-
-    # Getting the WER from the accumulated levenshteins and lengths
-    samples_wer = total_levenshtein / total_label_length
+    # Getting the WER and CER from the accumulated edit distances and lengths
+    samples_wer, samples_cer = wer_cer_batch(labels, decodings)

     # Order the remaining items by their loss (lowest loss on top)
     samples.sort(key=lambda s: s.loss)

     # Then order by WER (highest WER on top)
     samples.sort(key=lambda s: s.wer, reverse=True)

-    return samples_wer, samples
+    return samples_wer, samples_cer, samples


 def evaluate(test_data, inference_graph):
@@ -114,7 +110,14 @@ def create_windows(features):
     labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
     label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

-    sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
+    # We add 1 to all elements of the transcript to avoid any zero values
+    # since we use that as an end-of-sequence token for converting the batch
+    # into a SparseTensor. So here we convert the placeholder back into a
+    # SparseTensor and subtract ones to get the real labels.
+    sparse_labels = tf.contrib.layers.dense_to_sparse(labels_ph)
+    neg_ones = tf.SparseTensor(sparse_labels.indices, -1 * tf.ones_like(sparse_labels.values), sparse_labels.dense_shape)
+    sparse_labels = tf.sparse_add(sparse_labels, neg_ones)
+
     loss = tf.nn.ctc_loss(labels=sparse_labels,
                           inputs=layers['raw_logits'],
                           sequence_length=inputs['input_lengths'])
@@ -146,7 +149,7 @@ def create_windows(features):

     features = pad_to_dense(batch['features'].values)
     features_len = batch['features_len'].values
-    labels = pad_to_dense(batch['transcript'].values)
+    labels = pad_to_dense(batch['transcript'].values + 1)
     label_lengths = batch['transcript_len'].values

     logits, loss_ = session.run([transposed, loss], feed_dict={
@@ -183,15 +186,14 @@ def create_windows(features):

     distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

-    wer, samples = calculate_report(ground_truths, predictions, distances, losses)
-    mean_edit_distance = np.mean(distances)
+    wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
     mean_loss = np.mean(losses)

     # Take only the first report_count items
     report_samples = itertools.islice(samples, FLAGS.report_count)

     print('Test - WER: %f, CER: %f, loss: %f' %
-          (wer, mean_edit_distance, mean_loss))
+          (wer, cer, mean_loss))
     print('-' * 80)
     for sample in report_samples:
         print('WER: %f, CER: %f, loss: %f' %
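
The added lines in the @@ -114,7 +110,14 @@ hunk rely on a convention introduced by this change: every transcript value is shifted up by 1 before batching, so 0 is reserved for padding / end-of-sequence. Below is a minimal sketch of that dense-to-sparse round trip, assuming TensorFlow 1.x with tf.contrib available; the example batch is invented:

import tensorflow as tf

# Two transcripts, already shifted by +1 and padded with 0; dense_to_sparse
# treats 0 as the end-of-sequence token and drops those entries.
padded = tf.constant([[3, 5, 2, 0, 0],
                      [7, 1, 0, 0, 0]], dtype=tf.int32)

sparse = tf.contrib.layers.dense_to_sparse(padded)
neg_ones = tf.SparseTensor(sparse.indices,
                           -1 * tf.ones_like(sparse.values),
                           sparse.dense_shape)
labels = tf.sparse_add(sparse, neg_ones)  # undo the +1 shift

with tf.Session() as session:
    print(session.run(labels).values)  # [2 4 1 6 0]

The last value shows why the shift is needed: a genuine label 0 survives the round trip (tf.sparse_add keeps zero-valued entries at its default threshold), whereas passing unshifted labels straight to dense_to_sparse would silently drop it.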

util/feeding.py (+8 -4)

@@ -5,7 +5,6 @@
 from six.moves import range
 from threading import Thread
 from util.gpu import get_available_gpus
-from util.text import ctc_label_dense_to_sparse


 class ModelFeeder(object):
@@ -143,11 +142,14 @@ def _populate_batch_queue(self, session, coord):
                                           (features.strides[0], features.strides[0], features.strides[1]),
                                           writeable=False)

+            # We add 1 to all elements of the transcript here to avoid any zero
+            # values since we use that as an end-of-sequence token for converting
+            # the batch into a SparseTensor.
             try:
                 session.run(self._enqueue_op, feed_dict={
                     self._model_feeder.ph_x: features,
                     self._model_feeder.ph_x_length: num_strides,
-                    self._model_feeder.ph_y: transcript,
+                    self._model_feeder.ph_y: transcript + 1,
                     self._model_feeder.ph_y_length: transcript_len
                 })
             except tf.errors.CancelledError:
@@ -173,8 +175,10 @@ def next_batch(self):
         Draw the next batch from from the combined switchable queue.
         '''
         source, source_lengths, target, target_lengths = self._queue.dequeue_many(self._model_feeder.ph_batch_size)
-        sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._model_feeder.ph_batch_size)
-        return source, source_lengths, sparse_labels
+        # Back to sparse, then subtract one to get the real labels
+        sparse_labels = tf.contrib.layers.dense_to_sparse(target)
+        neg_ones = tf.SparseTensor(sparse_labels.indices, -1 * tf.ones_like(sparse_labels.values), sparse_labels.dense_shape)
+        return source, source_lengths, tf.sparse_add(sparse_labels, neg_ones)

     def start_queue_threads(self, session, coord):
         '''
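
The feeding side applies the same convention before the transcript enters the queue: as the comment in the hunk above says, 0 is reserved as the end-of-sequence/padding value, so every real label is moved up by 1 at enqueue time and moved back down in next_batch. A rough NumPy-only illustration of the enqueue-side shift follows; pad_to_batch is a hypothetical helper written for this example, not part of the DeepSpeech code:

import numpy as np

def pad_to_batch(transcripts):
    # Pad a list of label sequences into one dense batch, using 0 for padding
    # and shifting every real label by +1 so it can never collide with 0.
    max_len = max(len(t) for t in transcripts)
    batch = np.zeros((len(transcripts), max_len), dtype=np.int32)
    for i, t in enumerate(transcripts):
        batch[i, :len(t)] = np.asarray(t) + 1
    return batch

print(pad_to_batch([[0, 4, 2], [5, 1]]))
# [[1 5 3]
#  [6 2 0]]

Label 0 in the first transcript becomes 1, so the only zeros left in the batch are padding, which is exactly what next_batch assumes when it converts the dequeued tensor back into a SparseTensor.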

util/text.py (+21 -104)

@@ -2,12 +2,10 @@

 import codecs
 import numpy as np
-import tensorflow as tf
 import re
 import sys

 from six.moves import range
-from functools import reduce

 class Alphabet(object):
     def __init__(self, config_file):
@@ -56,74 +54,42 @@ def size(self):
     def config_file(self):
         return self._config_file

+
 def text_to_char_array(original, alphabet):
     r"""
     Given a Python string ``original``, remove unsupported characters, map characters
     to integers and return a numpy array representing the processed string.
     """
     return np.asarray([alphabet.label_from_string(c) for c in original])

-def sparse_tuple_from(sequences, dtype=np.int32):
-    r"""Creates a sparse representention of ``sequences``.
-    Args:
-        * sequences: a list of lists of type dtype where each element is a sequence
-
-    Returns a tuple with (indices, values, shape)
-    """
-    indices = []
-    values = []
-
-    for n, seq in enumerate(sequences):
-        indices.extend(zip([n]*len(seq), range(len(seq))))
-        values.extend(seq)

-    indices = np.asarray(indices, dtype=np.int64)
-    values = np.asarray(values, dtype=dtype)
-    shape = np.asarray([len(sequences), indices.max(0)[1]+1], dtype=np.int64)
-
-    return tf.SparseTensor(indices=indices, values=values, shape=shape)
-
-def sparse_tensor_value_to_texts(value, alphabet):
-    r"""
-    Given a :class:`tf.SparseTensor` ``value``, return an array of Python strings
-    representing its values.
-    """
-    return sparse_tuple_to_texts((value.indices, value.values, value.dense_shape), alphabet)
-
-def sparse_tuple_to_texts(tuple, alphabet):
-    indices = tuple[0]
-    values = tuple[1]
-    results = [''] * tuple[2][0]
-    for i in range(len(indices)):
-        index = indices[i][0]
-        results[index] += alphabet.string_from_label(values[i])
-    # List of strings
-    return results
-
-def wer(original, result):
+def wer_cer_batch(originals, results):
     r"""
     The WER is defined as the editing/Levenshtein distance on word level
     divided by the amount of words in the original text.
     In case of the original having more words (N) than the result and both
     being totally different (all N words resulting in 1 edit operation each),
     the WER will always be 1 (N / N = 1).
     """
-    # The WER ist calculated on word (and NOT on character) level.
-    # Therefore we split the strings into words first:
-    original = original.split()
-    result = result.split()
-    return levenshtein(original, result) / float(len(original))
-
-def wers(originals, results):
-    count = len(originals)
-    rates = []
-    mean = 0.0
-    assert count == len(results)
-    for i in range(count):
-        rate = wer(originals[i], results[i])
-        mean = mean + rate
-        rates.append(rate)
-    return rates, mean / float(count)
+    # The WER is calculated on word (and NOT on character) level.
+    # Therefore we split the strings into words first
+    assert len(originals) == len(results)
+
+    total_cer = 0.0
+    total_char_length = 0.0
+
+    total_wer = 0.0
+    total_word_length = 0.0
+
+    for original, result in zip(originals, results):
+        total_cer += levenshtein(original, result)
+        total_char_length += len(original)
+
+        total_wer += levenshtein(original.split(), result.split())
+        total_word_length += len(original.split())
+
+    return total_wer / total_word_length, total_cer / total_char_length
+

 # The following code is from: http://hetland.org/coding/python/levenshtein.py

@@ -155,55 +121,6 @@ def levenshtein(a,b):

     return current[n]

-# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
-#
-# Unfortunately we can't just use tf.gather_nd because it does not have gradients
-# implemented yet, so we need this workaround.
-#
-def gather_nd(params, indices, shape):
-    rank = len(shape)
-    flat_params = tf.reshape(params, [-1])
-    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
-    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
-    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
-    return tf.gather(flat_params, flat_indices)
-
-# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
-#
-# The CTC implementation in TensorFlow needs labels in a sparse representation,
-# but sparse data and queues don't mix well, so we store padded tensors in the
-# queue and convert to a sparse representation after dequeuing a batch.
-#
-def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
-    # The second dimension of labels must be equal to the longest label length in the batch
-    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
-    with tf.control_dependencies([correct_shape_assert]):
-        labels = tf.identity(labels)
-
-    label_shape = tf.shape(labels)
-    num_batches_tns = tf.stack([label_shape[0]])
-    max_num_labels_tns = tf.stack([label_shape[1]])
-    def range_less_than(previous_state, current_input):
-        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
-
-    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
-    init = tf.expand_dims(init, 0)
-    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
-    dense_mask = dense_mask[:, 0, :]
-
-    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
-                             label_shape)
-    label_ind = tf.boolean_mask(label_array, dense_mask)
-
-    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
-    batch_ind = tf.boolean_mask(batch_array, dense_mask)
-
-    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
-    shape = [batch_size, tf.reduce_max(label_lengths)]
-    vals_sparse = gather_nd(labels, indices, shape)
-
-    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
-
 # Validate and normalize transcriptions. Returns a cleaned version of the label
 # or None if it's invalid.
 def validate_label(label):
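
With the TensorFlow-dependent helpers removed, util/text.py no longer imports TensorFlow, so the new metric code can be exercised without a TensorFlow installation. A small usage sketch follows (the transcripts are invented; it assumes the module is importable from the repository root):

from util.text import wer_cer_batch, levenshtein

ground_truths = ['the cat sat on the mat', 'hello world']
predictions   = ['the cat sit on mat', 'hello word']

wer, cer = wer_cer_batch(ground_truths, predictions)
print('WER: %f, CER: %f' % (wer, cer))

# levenshtein works on any sequence: a string gives character-level distance,
# a list of words gives word-level distance.
print(levenshtein('kitten', 'sitting'))             # 3
print(levenshtein('a b c'.split(), 'a c'.split()))  # 1

Note that wer_cer_batch accumulates edit distances and reference lengths over the whole batch before dividing, rather than averaging per-sample rates the way the removed wers() did, so long and short transcripts are weighted by their length.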
