13
13
14
14
from ..logger import logger
15
15
16
- def _unpack (tuples ):
17
- """Convert list of tuples into the correct format:
18
-
19
- From:
20
-
21
- [
22
- (
23
- (token0, token1, token2, token3),
24
- (label0, label1, label2, label3),
25
- ),
26
- (
27
- (token0, token1, token2),
28
- (label0, label1, label2),
29
- ),
30
- ]
31
-
32
- to:
33
- [
34
- (
35
- (token0, token1, token2, token3),
36
- (token0, token1, token2),
37
- ),
38
- (
39
- (label0, label1, label2, label3),
40
- (label0, label1, label2),
41
- ),
42
- ]
43
- """
44
- return list (zip (* list (tuples )))
45
-
46
- def _split_list_by_linebreaks (rows ):
16
+ def _split_list_by_linebreaks (tokens ):
47
17
"""Cycle through a list of tokens (or labels) and split them into lists
48
18
based on the presence of Nones or more likely math.nan caused by converting
49
19
pd.DataFrame columns to lists.
50
20
"""
51
21
out = []
52
- rows_gen = iter (rows )
22
+ tokens_gen = iter (tokens )
53
23
while True :
54
24
try :
55
- row = next (rows_gen )
56
- token = row [0 ]
25
+ token = next (tokens_gen )
57
26
if isinstance (token , str ) and token :
58
- out .append (row )
27
+ out .append (token )
59
28
else :
60
29
yield out
61
30
out = []
@@ -71,8 +40,10 @@ def load_tsv(filepath, split_char="\t"):
71
40
Expects data in the following format (tab separations).
72
41
73
42
References o o
43
+ o o
74
44
1 o o
75
45
. o o
46
+ o o
76
47
WHO title b-r
77
48
treatment title i-r
78
49
guidelines title i-r
@@ -84,6 +55,8 @@ def load_tsv(filepath, split_char="\t"):
84
55
, title i-r
85
56
2016 title i-r
86
57
58
+
59
+
87
60
Args:
88
61
filepath (str): Path to the data.
89
62
split_char(str): Character to be used to split each line of the
@@ -94,16 +67,9 @@ def load_tsv(filepath, split_char="\t"):
94
67
filepath.
95
68
96
69
"""
97
- df = pd .read_csv (filepath , delimiter = split_char , header = None , skip_blank_lines = False )
98
- tuples = _split_list_by_linebreaks (df .to_records (index = False ))
99
70
100
- # Remove leading empty lists if found
101
-
102
- tuples = list (filter (None , tuples ))
103
-
104
- unpacked_tuples = list (map (_unpack , tuples ))
105
-
106
- out = _unpack (unpacked_tuples )
71
+ df = pd .read_csv (filepath , delimiter = split_char , header = None , skip_blank_lines = False )
72
+ out = [list (_split_list_by_linebreaks (column )) for _ , column in df .iteritems ()]
107
73
108
74
logger .info ("Loaded %s training examples" , len (out [0 ]))
109
75
0 commit comments