13
13
14
14
from ..logger import logger
15
15
16
- def _split_list_by_linebreaks (tokens ):
16
+ def _unpack (tuples ):
17
+ """Convert list of tuples into the correct format:
18
+
19
+ From:
20
+
21
+ [
22
+ (
23
+ (token0, token1, token2, token3),
24
+ (label0, label1, label2, label3),
25
+ ),
26
+ (
27
+ (token0, token1, token2),
28
+ (label0, label1, label2),
29
+ ),
30
+ )
31
+
32
+ to:
33
+ ]
34
+ (
35
+ (token0, token1, token2, token3),
36
+ (token0, token1, token2),
37
+ ),
38
+ (
39
+ (label0, label1, label2, label3),
40
+ (label0, label1, label2),
41
+ ),
42
+ ]
43
+ """
44
+ return list (zip (* list (tuples )))
45
+
46
+ def _split_list_by_linebreaks (rows ):
17
47
"""Cycle through a list of tokens (or labels) and split them into lists
18
48
based on the presence of Nones or more likely math.nan caused by converting
19
49
pd.DataFrame columns to lists.
20
50
"""
21
51
out = []
22
- tokens_gen = iter (tokens )
52
+ rows_gen = iter (rows )
23
53
while True :
24
54
try :
25
- token = next (tokens_gen )
55
+ row = next (rows_gen )
56
+ token = row [0 ]
26
57
if isinstance (token , str ) and token :
27
- out .append (token )
58
+ out .append (row )
28
59
else :
29
60
yield out
30
61
out = []
@@ -40,10 +71,8 @@ def load_tsv(filepath, split_char="\t"):
40
71
Expects data in the following format (tab separations).
41
72
42
73
References o o
43
- o o
44
74
1 o o
45
75
. o o
46
- o o
47
76
WHO title b-r
48
77
treatment title i-r
49
78
guidelines title i-r
@@ -55,8 +84,6 @@ def load_tsv(filepath, split_char="\t"):
55
84
, title i-r
56
85
2016 title i-r
57
86
58
-
59
-
60
87
Args:
61
88
filepath (str): Path to the data.
62
89
split_char(str): Character to be used to split each line of the
@@ -67,9 +94,16 @@ def load_tsv(filepath, split_char="\t"):
67
94
filepath.
68
95
69
96
"""
70
-
71
97
df = pd .read_csv (filepath , delimiter = split_char , header = None , skip_blank_lines = False )
72
- out = [list (_split_list_by_linebreaks (column )) for _ , column in df .iteritems ()]
98
+ tuples = _split_list_by_linebreaks (df .to_records (index = False ))
99
+
100
+ # Remove leading empty lists if found
101
+
102
+ tuples = list (filter (None , tuples ))
103
+
104
+ unpacked_tuples = list (map (_unpack , tuples ))
105
+
106
+ out = _unpack (unpacked_tuples )
73
107
74
108
logger .info ("Loaded %s training examples" , len (out [0 ]))
75
109
0 commit comments