wellcometrust
diff --git a/‎deep_reference_parser/io/io.py
Lines changed: 44 additions & 10 deletions b/‎deep_reference_parser/io/io.py
Lines changed: 44 additions & 10 deletions
@@ -13,18 +13,49 @@
 
 from ..logger import logger
 
-def _split_list_by_linebreaks(tokens):
+def _unpack(tuples):
+    """Convert list of tuples into the correct format:
+
+    From:
+
+        [
+            (
+                (token0, token1, token2, token3),
+                (label0, label1, label2, label3),
+            ),
+            (
+                (token0, token1, token2),
+                (label0, label1, label2),
+            ),
+        )
+
+    to:
+        ]
+            (
+                (token0, token1, token2, token3),
+                (token0, token1, token2),
+            ),
+            (
+                (label0, label1, label2, label3),
+                (label0, label1, label2),
+            ),
+        ]
+    """
+    return list(zip(*list(tuples)))
+
+def _split_list_by_linebreaks(rows):
     """Cycle through a list of tokens (or labels) and split them into lists
     based on the presence of Nones or more likely math.nan caused by converting
     pd.DataFrame columns to lists.
     """
     out = []
-    tokens_gen = iter(tokens)
+    rows_gen = iter(rows)
     while True:
         try:
-            token = next(tokens_gen)
+            row = next(rows_gen)
+            token = row[0]
             if isinstance(token, str) and token:
-                out.append(token)
+                out.append(row)
             else:
                 yield out
                 out = []
@@ -40,10 +71,8 @@ def load_tsv(filepath, split_char="\t"):
     Expects data in the following format (tab separations).
 
       References   o       o
-                   o       o
                1   o       o
                .   o       o
-                   o       o
              WHO   title   b-r
        treatment   title   i-r
       guidelines   title   i-r
@@ -55,8 +84,6 @@ def load_tsv(filepath, split_char="\t"):
                ,   title   i-r
             2016   title   i-r
 
-
-
     Args:
         filepath (str): Path to the data.
         split_char(str): Character to be used to split each line of the
@@ -67,9 +94,16 @@ def load_tsv(filepath, split_char="\t"):
         filepath.
 
     """
-
     df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
-    out = [list(_split_list_by_linebreaks(column)) for _, column in df.iteritems()]
+    tuples = _split_list_by_linebreaks(df.to_records(index=False))
+
+    # Remove leading empty lists if found
+
+    tuples = list(filter(None, tuples))
+
+    unpacked_tuples = list(map(_unpack, tuples))
+
+    out = _unpack(unpacked_tuples)
 
     logger.info("Loaded %s training examples", len(out[0]))