
Commit 7a14bcc

Clean up and split TensorFlow deps of text.py
1 parent 3378008 commit 7a14bcc


4 files changed: +86 -119 lines changed


evaluate.py

+10 -14

@@ -19,10 +19,11 @@
 from six.moves import zip, range
 from util.audio import audiofile_to_input_vector
 from util.config import Config, initialize_globals
+from util.ctc import ctc_label_dense_to_sparse
 from util.flags import create_flags, FLAGS
 from util.logging import log_error
 from util.preprocess import pmap, preprocess
-from util.text import Alphabet, ctc_label_dense_to_sparse, wer, levenshtein
+from util.text import Alphabet, wer_cer_batch, levenshtein
 
 
 def split_data(dataset, batch_size):
@@ -47,15 +48,14 @@ def pad_to_dense(jagged):
 
 def process_decode_result(item):
     label, decoding, distance, loss = item
-    sample_wer = wer(label, decoding)
+    word_distance = levenshtein(label.split(), decoding.split())
+    word_length = float(len(label.split()))
     return AttrDict({
         'src': label,
         'res': decoding,
         'loss': loss,
         'distance': distance,
-        'wer': sample_wer,
-        'levenshtein': levenshtein(label.split(), decoding.split()),
-        'label_length': float(len(label.split())),
+        'wer': word_distance / word_length,
     })
 
 
@@ -67,19 +67,16 @@ def calculate_report(labels, decodings, distances, losses):
     '''
     samples = pmap(process_decode_result, zip(labels, decodings, distances, losses))
 
-    total_levenshtein = sum(s.levenshtein for s in samples)
-    total_label_length = sum(s.label_length for s in samples)
-
-    # Getting the WER from the accumulated levenshteins and lengths
-    samples_wer = total_levenshtein / total_label_length
+    # Getting the WER and CER from the accumulated edit distances and lengths
+    samples_wer, samples_cer = wer_cer_batch(labels, decodings)
 
     # Order the remaining items by their loss (lowest loss on top)
     samples.sort(key=lambda s: s.loss)
 
     # Then order by WER (highest WER on top)
     samples.sort(key=lambda s: s.wer, reverse=True)
 
-    return samples_wer, samples
+    return samples_wer, samples_cer, samples
 
 
 def evaluate(test_data, inference_graph):
@@ -183,15 +180,14 @@ def create_windows(features):
 
     distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
 
-    wer, samples = calculate_report(ground_truths, predictions, distances, losses)
-    mean_edit_distance = np.mean(distances)
+    wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
     mean_loss = np.mean(losses)
 
     # Take only the first report_count items
    report_samples = itertools.islice(samples, FLAGS.report_count)
 
     print('Test - WER: %f, CER: %f, loss: %f' %
-          (wer, mean_edit_distance, mean_loss))
+          (wer, cer, mean_loss))
     print('-' * 80)
     for sample in report_samples:
         print('WER: %f, CER: %f, loss: %f' %

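Note: the per-sample WER stored by process_decode_result is now simply the word-level Levenshtein distance divided by the number of words in the reference. A minimal sketch of that computation (plain Python, using the repo's util.text.levenshtein; the example strings are made up):

    from util.text import levenshtein

    label = 'the quick brown fox'
    decoding = 'the quick brown box'

    word_distance = levenshtein(label.split(), decoding.split())  # 1 substitution
    word_length = float(len(label.split()))                       # 4 words
    print(word_distance / word_length)                            # per-sample WER: 0.25
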
util/ctc.py

+57

@@ -0,0 +1,57 @@
+from __future__ import absolute_import, division, print_function
+
+import tensorflow as tf
+
+from functools import reduce
+from six.moves import range
+
+
+# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
+#
+# Unfortunately we can't just use tf.gather_nd because it does not have gradients
+# implemented yet, so we need this workaround.
+#
+def gather_nd(params, indices, shape):
+    rank = len(shape)
+    flat_params = tf.reshape(params, [-1])
+    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
+    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
+    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
+    return tf.gather(flat_params, flat_indices)
+
+
+# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
+#
+# The CTC implementation in TensorFlow needs labels in a sparse representation,
+# but sparse data and queues don't mix well, so we store padded tensors in the
+# queue and convert to a sparse representation after dequeuing a batch.
+#
+def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
+    # The second dimension of labels must be equal to the longest label length in the batch
+    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
+    with tf.control_dependencies([correct_shape_assert]):
+        labels = tf.identity(labels)
+
+    label_shape = tf.shape(labels)
+    num_batches_tns = tf.stack([label_shape[0]])
+    max_num_labels_tns = tf.stack([label_shape[1]])
+    def range_less_than(previous_state, current_input):
+        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
+
+    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
+    init = tf.expand_dims(init, 0)
+    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
+    dense_mask = dense_mask[:, 0, :]
+
+    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
+                             label_shape)
+    label_ind = tf.boolean_mask(label_array, dense_mask)
+
+    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
+    batch_ind = tf.boolean_mask(batch_array, dense_mask)
+
+    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
+    shape = [batch_size, tf.reduce_max(label_lengths)]
+    vals_sparse = gather_nd(labels, indices, shape)
+
+    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))

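The new module keeps these TensorFlow-dependent helpers in one place. A minimal usage sketch for the dense-to-sparse conversion (TF 1.x API, to match the code above; the label values are made up):

    import tensorflow as tf

    from util.ctc import ctc_label_dense_to_sparse

    # Two padded transcriptions: lengths 3 and 2, padded to the batch maximum of 3.
    labels = tf.constant([[1, 2, 3],
                          [4, 5, 0]], dtype=tf.int32)
    label_lengths = tf.constant([3, 2], dtype=tf.int32)

    sparse_labels = ctc_label_dense_to_sparse(labels, label_lengths, batch_size=2)

    with tf.Session() as session:
        result = session.run(sparse_labels)
        print(result.indices.tolist())  # [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]]
        print(result.values.tolist())   # [1, 2, 3, 4, 5] -- the padding is dropped

The resulting tf.SparseTensor is the form that tf.nn.ctc_loss expects for its labels argument.
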
util/feeding.py

+1 -1

@@ -4,8 +4,8 @@
 from math import ceil
 from six.moves import range
 from threading import Thread
+from util.ctc import ctc_label_dense_to_sparse
 from util.gpu import get_available_gpus
-from util.text import ctc_label_dense_to_sparse
 
 
 class ModelFeeder(object):

util/text.py

+18 -104

@@ -2,12 +2,10 @@
 
 import codecs
 import numpy as np
-import tensorflow as tf
 import re
 import sys
 
 from six.moves import range
-from functools import reduce
 
 class Alphabet(object):
     def __init__(self, config_file):
@@ -56,74 +54,39 @@ def size(self):
     def config_file(self):
         return self._config_file
 
+
 def text_to_char_array(original, alphabet):
     r"""
     Given a Python string ``original``, remove unsupported characters, map characters
     to integers and return a numpy array representing the processed string.
     """
     return np.asarray([alphabet.label_from_string(c) for c in original])
 
-def sparse_tuple_from(sequences, dtype=np.int32):
-    r"""Creates a sparse representention of ``sequences``.
-    Args:
-        * sequences: a list of lists of type dtype where each element is a sequence
-
-    Returns a tuple with (indices, values, shape)
-    """
-    indices = []
-    values = []
-
-    for n, seq in enumerate(sequences):
-        indices.extend(zip([n]*len(seq), range(len(seq))))
-        values.extend(seq)
 
-    indices = np.asarray(indices, dtype=np.int64)
-    values = np.asarray(values, dtype=dtype)
-    shape = np.asarray([len(sequences), indices.max(0)[1]+1], dtype=np.int64)
-
-    return tf.SparseTensor(indices=indices, values=values, shape=shape)
-
-def sparse_tensor_value_to_texts(value, alphabet):
-    r"""
-    Given a :class:`tf.SparseTensor` ``value``, return an array of Python strings
-    representing its values.
-    """
-    return sparse_tuple_to_texts((value.indices, value.values, value.dense_shape), alphabet)
-
-def sparse_tuple_to_texts(tuple, alphabet):
-    indices = tuple[0]
-    values = tuple[1]
-    results = [''] * tuple[2][0]
-    for i in range(len(indices)):
-        index = indices[i][0]
-        results[index] += alphabet.string_from_label(values[i])
-    # List of strings
-    return results
-
-def wer(original, result):
+def wer_cer_batch(originals, results):
     r"""
     The WER is defined as the editing/Levenshtein distance on word level
     divided by the amount of words in the original text.
     In case of the original having more words (N) than the result and both
     being totally different (all N words resulting in 1 edit operation each),
     the WER will always be 1 (N / N = 1).
     """
-    # The WER ist calculated on word (and NOT on character) level.
-    # Therefore we split the strings into words first:
-    original = original.split()
-    result = result.split()
-    return levenshtein(original, result) / float(len(original))
-
-def wers(originals, results):
-    count = len(originals)
-    rates = []
-    mean = 0.0
-    assert count == len(results)
-    for i in range(count):
-        rate = wer(originals[i], results[i])
-        mean = mean + rate
-        rates.append(rate)
-    return rates, mean / float(count)
+    # The WER is calculated on word (and NOT on character) level.
+    # Therefore we split the strings into words first
+    assert len(originals) == len(results)
+
+    total_cer = 0.0
+
+    total_wer = 0.0
+    total_word_length = 0.0
+
+    for original, result in zip(originals, results):
+        total_cer += levenshtein(original, result)
+
+        total_wer += levenshtein(original.split(), result.split())
+        total_word_length += len(original.split())
+
+    return total_wer / total_word_length, total_cer / len(originals)
 
 # The following code is from: http://hetland.org/coding/python/levenshtein.py
 
@@ -155,55 +118,6 @@ def levenshtein(a,b):
 
     return current[n]
 
-# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
-#
-# Unfortunately we can't just use tf.gather_nd because it does not have gradients
-# implemented yet, so we need this workaround.
-#
-def gather_nd(params, indices, shape):
-    rank = len(shape)
-    flat_params = tf.reshape(params, [-1])
-    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
-    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
-    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
-    return tf.gather(flat_params, flat_indices)
-
-# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
-#
-# The CTC implementation in TensorFlow needs labels in a sparse representation,
-# but sparse data and queues don't mix well, so we store padded tensors in the
-# queue and convert to a sparse representation after dequeuing a batch.
-#
-def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
-    # The second dimension of labels must be equal to the longest label length in the batch
-    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
-    with tf.control_dependencies([correct_shape_assert]):
-        labels = tf.identity(labels)
-
-    label_shape = tf.shape(labels)
-    num_batches_tns = tf.stack([label_shape[0]])
-    max_num_labels_tns = tf.stack([label_shape[1]])
-    def range_less_than(previous_state, current_input):
-        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
-
-    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
-    init = tf.expand_dims(init, 0)
-    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
-    dense_mask = dense_mask[:, 0, :]
-
-    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
-                             label_shape)
-    label_ind = tf.boolean_mask(label_array, dense_mask)
-
-    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
-    batch_ind = tf.boolean_mask(batch_array, dense_mask)
-
-    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
-    shape = [batch_size, tf.reduce_max(label_lengths)]
-    vals_sparse = gather_nd(labels, indices, shape)
-
-    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
-
 # Validate and normalize transcriptions. Returns a cleaned version of the label
 # or None if it's invalid.
 def validate_label(label):

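A minimal sketch of the new batch metrics (plain Python; the example strings are made up). Note that the returned CER is the character-level edit distance averaged over the number of samples, not normalized by transcript length:

    from util.text import wer_cer_batch

    originals = ['hello world', 'good morning']
    results = ['hello word', 'good morning']

    wer, cer = wer_cer_batch(originals, results)
    # Word level: 1 edit ('world' -> 'word') over 4 reference words -> WER 0.25
    # Character level: 1 edit, averaged over 2 samples -> CER 0.5
    print(wer, cer)  # 0.25 0.5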