Commit a3323e5

chg: Use lists for y case to allow multiple labels
1 parent fbc37d1 commit a3323e5

3 files changed: +54 -34 lines

deep_reference_parser/deep_reference_parser.py

Lines changed: 42 additions & 26 deletions
@@ -72,8 +72,8 @@ def __init__(
         y_train=None,
         y_test=None,
         y_valid=None,
-        digits_word="$NUM$",
-        ukn_words="out-of-vocabulary",
+        digits_word="<NUM>",
+        ukn_words="<OOV>",
         padding_style="pre",
         output_path="data/model_output",
     ):
@@ -165,13 +165,11 @@ def prepare_data(self, save=False):
         # Compute indexes for words+labels in the training data

         self.word2ind, self.ind2word = index_x(self.X_train_merged, self.ukn_words)
-        self.label2ind, ind2label = index_y(self.y_train)

-        # NOTE: The original code expected self.ind2label to be a list,
-        # in case you are training a multi-task model. For this reason,
-        # self.index2label is wrapped in a list.
+        y_labels = list(map(index_y, self.y_train))

-        self.ind2label.append(ind2label)
+        self.ind2label = [ind2label for _, ind2label in y_labels]
+        self.label2ind = [label2ind for label2ind, _ in y_labels]

         # Convert data into indexes data

@@ -209,21 +207,41 @@ def prepare_data(self, save=False):

         # Encode y variables

-        self.y_train_encoded = encode_y(
-            self.y_train, self.label2ind, self.max_len, self.padding_style
-        )
+        for i, labels in enumerate(self.y_train):
+            self.y_train_encoded.append(
+                encode_y(
+                    labels,
+                    self.label2ind[i],
+                    self.max_len,
+                    self.padding_style
+                )
+            )

-        self.y_test_encoded = encode_y(
-            self.y_test, self.label2ind, self.max_len, self.padding_style
-        )
+        for i, labels in enumerate(self.y_test):
+            self.y_test_encoded.append(
+                encode_y(
+                    labels,
+                    self.label2ind[i],
+                    self.max_len,
+                    self.padding_style
+                )
+            )

-        self.y_valid_encoded = encode_y(
-            self.y_valid, self.label2ind, self.max_len, self.padding_style
-        )
+        for i, labels in enumerate(self.y_valid):
+            self.y_valid_encoded.append(
+                encode_y(
+                    labels,
+                    self.label2ind[i],
+                    self.max_len,
+                    self.padding_style
+                )
+            )
+
+
+        logger.debug("Training target dimensions: %s", self.y_train_encoded[0].shape)
+        logger.debug("Test target dimensions: %s", self.y_test_encoded[0].shape)
+        logger.debug("Validation target dimensions: %s", self.y_valid_encoded[0].shape)

-        logger.debug("Training target dimensions: %s", self.y_train_encoded.shape)
-        logger.debug("Test target dimensions: %s", self.y_test_encoded.shape)
-        logger.debug("Validation target dimensions: %s", self.y_valid_encoded.shape)

         # Create character level data

@@ -456,7 +474,7 @@ def build_model(

         self.model = model

-        # logger.debug(self.model.summary(line_length=150))
+        logger.debug(self.model.summary(line_length=150))

     def train_model(
         self, epochs=25, batch_size=100, early_stopping_patience=5, metric="val_f1"
@@ -481,10 +499,8 @@ def train_model(

         # Use custom classification scores callback

-        # NOTE: X lists are important for input here
-
         classification_scores = Classification_Scores(
-            [self.X_training, [self.y_train_encoded]], self.ind2label, self.weights_path
+            [self.X_training, self.y_train_encoded], self.ind2label, self.weights_path
         )

         callbacks.append(classification_scores)
@@ -503,12 +519,12 @@ def train_model(

         hist = self.model.fit(
             x=self.X_training,
-            y=[self.y_train_encoded],
-            validation_data=[self.X_testing, [self.y_test_encoded]],
+            y=self.y_train_encoded,
+            validation_data=[self.X_testing, self.y_test_encoded],
             epochs=epochs,
             batch_size=batch_size,
             callbacks=callbacks,
-            verbose=2,
+            verbose=1,
         )

         logger.info(
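
The gist of this file's change: y_train, y_test and y_valid are now lists of label columns, each column getting its own label2ind/ind2label mapping and its own encoded target array, so a multi-task model can be trained against several tag sets at once. A minimal standalone sketch of the pattern (the toy_index_y helper and the inline padding below are simplified stand-ins, not the package's actual index_y/encode_y):

# Illustrative sketch only: simplified stand-ins to show how per-column
# label mappings and encoded targets are built from lists of y.

def toy_index_y(labels):
    # Map each distinct label to an integer index, reserving 0 for padding.
    uniq = sorted({lab for seq in labels for lab in seq})
    label2ind = {lab: i + 1 for i, lab in enumerate(uniq)}
    ind2label = {i: lab for lab, i in label2ind.items()}
    return label2ind, ind2label

# Two label columns ("tasks") over the same two token sequences.
y_train = [
    [["b-r", "i-r"], ["o", "o"]],        # column 1: reference spans
    [["author", "title"], ["o", "o"]],   # column 2: component types
]

y_labels = list(map(toy_index_y, y_train))
ind2label = [i2l for _, i2l in y_labels]
label2ind = [l2i for l2i, _ in y_labels]

# One encoded target per column, "pre"-padded to max_len with 0.
max_len = 3
y_train_encoded = []
for i, labels in enumerate(y_train):
    encoded = [[0] * (max_len - len(seq)) + [label2ind[i][lab] for lab in seq] for seq in labels]
    y_train_encoded.append(encoded)

print(label2ind[0])        # {'b-r': 1, 'i-r': 2, 'o': 3}
print(y_train_encoded[0])  # [[0, 1, 2], [0, 3, 3]]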

deep_reference_parser/model_utils.py

Lines changed: 2 additions & 2 deletions
@@ -154,9 +154,9 @@ def encode_y(y, label2ind, max_len, padding_style):

    # Encode y (with pad)

-    # Transform each label into its index in the data
+    # Transform each label into its index and adding "pre" padding

-    y_pad = [[0] * (max_len - len(ey)) + [label2ind[c] for c in ey] for ey in y]
+    y_pad = [[0] * (max_len - len(yi)) + [label2ind[label] for label in yi] for yi in y]

    # One-hot-encode label
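
The rename from ey/c to yi/label does not change behaviour: the comprehension still indexes each label and left-pads with 0 up to max_len ("pre" padding). A self-contained sketch of what that step produces (the label2ind values and the plain-numpy one-hot step below are chosen for illustration; they are not the package's own helper):

import numpy as np

# Toy inputs chosen for illustration.
label2ind = {"o": 1, "b-r": 2, "i-r": 3}
max_len = 4
y = [["b-r", "i-r"], ["o", "o", "o"]]

# "Pre" padding: index each label, then left-pad with 0 up to max_len.
y_pad = [[0] * (max_len - len(yi)) + [label2ind[label] for label in yi] for yi in y]
# -> [[0, 0, 2, 3], [0, 1, 1, 1]]

# One-hot-encode each padded index (0 is the padding class).
n_classes = max(label2ind.values()) + 1
y_onehot = np.eye(n_classes)[np.array(y_pad)]
print(y_onehot.shape)  # (2, 4, 4)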

deep_reference_parser/train.py

Lines changed: 10 additions & 6 deletions
@@ -70,13 +70,17 @@ def train(config_file):

    # Load policy data

-    X_train, y_train = load_tsv(POLICY_TRAIN)
-    X_test, y_test = load_tsv(POLICY_TEST)
-    X_valid, y_valid = load_tsv(POLICY_VALID)
+    train_data = load_tsv(POLICY_TRAIN)
+    test_data = load_tsv(POLICY_TEST)
+    valid_data = load_tsv(POLICY_VALID)

-    logger.info("X_train, y_train examples: %s, %s", len(X_train), len(y_train))
-    logger.info("X_test, y_test examples: %s, %s", len(X_test), len(y_test))
-    logger.info("X_valid, y_valid examples: %s, %s", len(X_valid), len(y_valid))
+    X_train, y_train = train_data[0], train_data[1:]
+    X_test, y_test = test_data[0], test_data[1:]
+    X_valid, y_valid = valid_data[0], valid_data[1:]
+
+    logger.info("X_train, y_train examples: %s, %s", len(X_train), list(map(len, y_train)))
+    logger.info("X_test, y_test examples: %s, %s", len(X_test), list(map(len, y_test)))
+    logger.info("X_valid, y_valid examples: %s, %s", len(X_valid), list(map(len, y_valid)))

    drp = DeepReferenceParser(
        X_train=X_train,
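
With load_tsv now returning the token column plus one or more label columns, the slice train_data[0], train_data[1:] splits the result into X_train and a list of label columns for y_train, and the log line reports one count per column. A hypothetical example of that unpacking (the three-element return value below is assumed for illustration, not the actual output of load_tsv):

# Hypothetical load_tsv output: token column plus two label columns.
train_data = (
    [["References", "1."], ["Smith", "J", "."]],   # X: token sequences
    [["o", "o"], ["b-r", "i-r", "i-r"]],           # y column 1
    [["title", "o"], ["author", "author", "o"]],   # y column 2
)

X_train, y_train = train_data[0], train_data[1:]

print(len(X_train))             # 2 examples
print(list(map(len, y_train)))  # [2, 2] -> one count per label column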

0 commit comments
