Skip to content

Commit 398e1ee

Browse files
authored
Merge pull request #233 from ZachEichen/io_tweaks
MINOR: add additional flexibility to a few functions within tp.io
2 parents 57638f5 + a568fdf commit 398e1ee

File tree

4 files changed

+161
-12
lines changed

4 files changed

+161
-12
lines changed

text_extensions_for_pandas/io/bert.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -189,43 +189,53 @@ def conll_to_bert(df: pd.DataFrame, tokenizer: Any, bert: Any,
189189

190190

191191
def align_bert_tokens_to_corpus_tokens(
192-
spans_df: pd.DataFrame, corpus_toks_df: pd.DataFrame) -> pd.DataFrame:
192+
spans_df: pd.DataFrame, corpus_toks_df: pd.DataFrame,
193+
spans_df_token_col:str='span',corpus_df_token_col:str='span',
194+
entity_type_col:str='ent_type') -> pd.DataFrame:
193195
"""
194196
Expand entity matches from a BERT-based model so that they align
195197
with the corpus's original tokenization.
196198
197199
:param spans_df: DataFrame of extracted entities. Must contain two
198-
columns: "span" and "ent_type". Other columns ignored.
200+
columns with span and entity type information respectively. Other columns ignored.
199201
:param corpus_toks_df: DataFrame of the corpus's original tokenization,
200202
one row per token.
201-
Must contain a column "span" with character-based spans of
203+
Must contain a column with character-based spans of
202204
the tokens.
205+
:param spans_df_token_col: the name of the column in `spans_df`
206+
containing its tokenization. By default, `'span'`
207+
:param corpus_df_token_col: the name of the column in `corpus_toks_df`
208+
that contains its tokenization. By default `'span'`
209+
:param entity_type_col: the name of the column in spans_df that
210+
contains the entity types of the elements
211+
203212
204213
:returns: A new DataFrame with schema ["span", "ent_type"],
205214
where the "span" column contains token-based spans based off
206215
the *corpus* tokenization in `corpus_toks_df["span"]`.
207216
"""
208217
if len(spans_df.index) == 0:
209218
return spans_df.copy()
219+
210220
overlaps_df = (
211221
spanner
212-
.overlap_join(spans_df["span"], corpus_toks_df["span"],
222+
.overlap_join(spans_df[spans_df_token_col], corpus_toks_df[corpus_df_token_col],
213223
"span", "corpus_token")
214-
.merge(spans_df)
224+
.merge(spans_df,left_on='span',right_on=spans_df_token_col)
215225
)
216226
agg_df = (
217227
overlaps_df
218228
.groupby("span")
219-
.aggregate({"corpus_token": "sum", "ent_type": "first"})
229+
.aggregate({"corpus_token": "sum", entity_type_col: "first"})
220230
.reset_index()
221231
)
222232
cons_df = (
223233
spanner.consolidate(agg_df, "corpus_token")
224-
[["corpus_token", "ent_type"]]
234+
[["corpus_token", entity_type_col]]
225235
.rename(columns={"corpus_token": "span"})
226236
)
227237
cons_df["span"] = TokenSpanArray.align_to_tokens(
228-
corpus_toks_df["span"], cons_df["span"])
238+
corpus_toks_df[corpus_df_token_col], cons_df["span"])
229239
return cons_df
230240

231241

text_extensions_for_pandas/io/conll.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ def _parse_conll_u_file(
361361
merge_subtokens: bool = False,
362362
merge_subtoken_separator: str = "|",
363363
metadata_fields: Dict[str, str] = _DEFAULT_EWT_METADATA,
364+
doc_seperator = _EWT_DOC_SEPERATOR
364365
) -> List[List[_SentenceData]]:
365366
"""
366367
@@ -424,8 +425,8 @@ def _parse_conll_u_file(
424425
current_sentence.set_batch_conll_u_metadata(u_metadata)
425426
elif line[0] == "#":
426427
line_elems = line.split(" = ")
427-
if line_elems[0] == _EWT_DOC_SEPERATOR:
428-
if i > 0:
428+
if line_elems[0] == doc_seperator:
429+
if i > 0 and len(sentences) > 0 :
429430
# End of document. Wrap up this document and start a new one.
430431
#
431432
docs.append(sentences)
@@ -664,8 +665,8 @@ def _doc_to_df(
664665
sentence_ends_list = [] # Type: List[np.ndarray]
665666

666667
# conll_u metadata information.
667-
conll_u_ids_exsist = doc is not None and doc[0].has_conll_u_metadata
668-
conll_2009_format = doc is not None and doc[0]._conll_09_format
668+
conll_u_ids_exsist = doc is not None and len(doc)!=0 and doc[0].has_conll_u_metadata
669+
conll_2009_format = doc is not None and len(doc)!=0 and doc[0]._conll_09_format
669670
# this should be the same for all sentences so we check the first
670671

671672
if conll_2009_format:
@@ -1053,6 +1054,7 @@ def conll_u_to_dataframes(
10531054
merge_subtoken_separator: str = "|",
10541055
numeric_cols: List[str] = _DEFAULT_CONLL_U_NUMERIC_COLS,
10551056
metadata_fields: Dict[str, str] = _DEFAULT_EWT_METADATA,
1057+
separate_sentences_by_doc = False
10561058
) -> List[pd.DataFrame]:
10571059
"""
10581060
Parses a file from
@@ -1094,6 +1096,9 @@ def conll_u_to_dataframes(
10941096
if iob_columns is None:
10951097
iob_columns = [False for i in range(len(column_names))]
10961098
# fill with falses if not specified
1099+
1100+
#
1101+
split_doc_by = "# text" if separate_sentences_by_doc else _EWT_DOC_SEPERATOR
10971102

10981103
parsed_docs = _parse_conll_u_file(
10991104
input_file,
@@ -1103,6 +1108,7 @@ def conll_u_to_dataframes(
11031108
merge_subtokens=merge_subtokens,
11041109
merge_subtoken_separator=merge_subtoken_separator,
11051110
metadata_fields=metadata_fields,
1111+
doc_seperator=split_doc_by
11061112
)
11071113
doc_dfs = [
11081114
_doc_to_df(d, column_names, iob_columns, space_before_punct, conll_u=True)

text_extensions_for_pandas/io/test_bert.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,31 @@ def test_conll_to_bert(self):
198198
3 [15, 22): 'Failure' PER
199199
4 [23, 24): '(' <NA>"""))
200200

201+
## test with renamed fields
202+
without_embeddings_alt = without_embeddings.rename(columns={
203+
'span':'span-1',
204+
'ent_type':'ent_type-1'})
205+
first_df_alt = first_df.rename(columns={'span':'span-2'})
206+
aligned_toks_alt = align_bert_tokens_to_corpus_tokens(without_embeddings_alt,
207+
first_df_alt,
208+
spans_df_token_col='span-1',
209+
corpus_df_token_col='span-2',
210+
entity_type_col='ent_type-1'
211+
)
212+
print(str(aligned_toks_alt.iloc[:num_rows]))
213+
self.assertEqual(
214+
str(aligned_toks_alt.iloc[:num_rows]),
215+
# NOTE: Don't forget to add both sets of double-backslashes back in if you
216+
# copy-and-paste an updated version of the output below!
217+
textwrap.dedent("""\
218+
span ent_type-1
219+
0 [0, 3): 'Who' <NA>
220+
1 [4, 6): 'is' <NA>
221+
2 [7, 14): 'General' PER
222+
3 [15, 22): 'Failure' PER
223+
4 [23, 24): '(' <NA>"""))
224+
225+
201226
def test_seq_to_windows(self):
202227
for seqlen in range(1, 20):
203228
seq = np.arange(1, seqlen)

text_extensions_for_pandas/io/test_conll.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,114 @@ def test_conll_2003_to_dataframes_multi_field(self):
249249
),
250250
)
251251

252+
def test_conll_u_to_dataframes_split_by_sent(self):
253+
dfs = conll_u_to_dataframes("test_data/io/test_conll/conll_u_test1.txt"
254+
,separate_sentences_by_doc=True)
255+
self.maxDiff = None
256+
print(f"***{repr(dfs[4].drop(columns=['sentence_id','doc_id','paragraph_id']))}***")
257+
self.assertEqual(
258+
repr(dfs[4].drop(columns=['sentence_id','doc_id','paragraph_id'])),
259+
# NOTE the escaped backslash in the string below. Be sure to put it back
260+
# in when regenerating this string!
261+
textwrap.dedent("""\
262+
span lemma upostag xpostag \\
263+
0 [0, 4): 'Bush' Bush PROPN NNP
264+
1 [5, 9): 'also' also ADV RB
265+
2 [10, 19): 'nominated' nominate VERB VBD
266+
3 [20, 22): 'A.' A. PROPN NNP
267+
4 [23, 27): 'Noel' Noel PROPN NNP
268+
5 [28, 36): 'Anketell' Anketell PROPN NNP
269+
6 [37, 43): 'Kramer' Kramer PROPN NNP
270+
7 [44, 47): 'for' for ADP IN
271+
8 [48, 49): 'a' a DET DT
272+
9 [50, 52): '15' 15 NUM CD
273+
10 [52, 53): '-' - PUNCT HYPH
274+
11 [54, 58): 'year' year NOUN NN
275+
12 [59, 63): 'term' term NOUN NN
276+
13 [64, 66): 'as' as ADP IN
277+
14 [67, 76): 'associate' associate ADJ JJ
278+
15 [77, 82): 'judge' judge NOUN NN
279+
16 [83, 85): 'of' of ADP IN
280+
17 [86, 89): 'the' the DET DT
281+
18 [90, 98): 'District' District PROPN NNP
282+
19 [99, 101): 'of' of ADP IN
283+
20 [102, 110): 'Columbia' Columbia PROPN NNP
284+
21 [111, 116): 'Court' Court PROPN NNP
285+
22 [117, 119): 'of' of ADP IN
286+
23 [120, 127): 'Appeals' Appeal PROPN NNPS
287+
24 [127, 128): ',' , PUNCT ,
288+
25 [129, 138): 'replacing' replace VERB VBG
289+
26 [139, 143): 'John' John PROPN NNP
290+
27 [144, 152): 'Montague' Montague PROPN NNP
291+
28 [153, 161): 'Steadman' Steadman PROPN NNP
292+
29 [161, 162): '.' . PUNCT .
293+
294+
features head deprel deps \\
295+
0 Number=Sing 2 nsubj 3:nsubj
296+
1 None 2 advmod 3:advmod
297+
2 Mood=Ind|Tense=Past|VerbForm=Fin <NA> root 0:root
298+
3 Number=Sing 2 obj 3:obj
299+
4 Number=Sing 3 flat 4:flat
300+
5 Number=Sing 3 flat 4:flat
301+
6 Number=Sing 3 flat 4:flat
302+
7 None 12 case 13:case
303+
8 Definite=Ind|PronType=Art 12 det 13:det
304+
9 NumType=Card 11 nummod 12:nummod
305+
10 None 11 punct 12:punct
306+
11 Number=Sing 12 compound 13:compound
307+
12 Number=Sing 2 obl 3:obl:for
308+
13 None 15 case 16:case
309+
14 Degree=Pos 15 amod 16:amod
310+
15 Number=Sing 12 nmod 13:nmod:as
311+
16 None 18 case 19:case
312+
17 Definite=Def|PronType=Art 18 det 19:det
313+
18 Number=Sing 15 nmod 16:nmod:of
314+
19 None 21 case 22:case
315+
20 Number=Sing 21 compound 22:compound
316+
21 Number=Sing 18 nmod 19:nmod:of
317+
22 None 23 case 24:case
318+
23 Number=Plur 21 nmod 22:nmod:of
319+
24 None 2 punct 3:punct
320+
25 VerbForm=Ger 2 advcl 3:advcl
321+
26 Number=Sing 25 obj 26:obj
322+
27 Number=Sing 26 flat 27:flat
323+
28 Number=Sing 26 flat 27:flat
324+
29 None 2 punct 3:punct
325+
326+
misc sentence line_num
327+
0 None [0, 162): 'Bush also nominated A. Noel Anketel... 73
328+
1 None [0, 162): 'Bush also nominated A. Noel Anketel... 74
329+
2 None [0, 162): 'Bush also nominated A. Noel Anketel... 75
330+
3 None [0, 162): 'Bush also nominated A. Noel Anketel... 76
331+
4 None [0, 162): 'Bush also nominated A. Noel Anketel... 77
332+
5 None [0, 162): 'Bush also nominated A. Noel Anketel... 78
333+
6 None [0, 162): 'Bush also nominated A. Noel Anketel... 79
334+
7 None [0, 162): 'Bush also nominated A. Noel Anketel... 80
335+
8 None [0, 162): 'Bush also nominated A. Noel Anketel... 81
336+
9 SpaceAfter=No [0, 162): 'Bush also nominated A. Noel Anketel... 82
337+
10 SpaceAfter=No [0, 162): 'Bush also nominated A. Noel Anketel... 83
338+
11 None [0, 162): 'Bush also nominated A. Noel Anketel... 84
339+
12 None [0, 162): 'Bush also nominated A. Noel Anketel... 85
340+
13 None [0, 162): 'Bush also nominated A. Noel Anketel... 86
341+
14 None [0, 162): 'Bush also nominated A. Noel Anketel... 87
342+
15 None [0, 162): 'Bush also nominated A. Noel Anketel... 88
343+
16 None [0, 162): 'Bush also nominated A. Noel Anketel... 89
344+
17 None [0, 162): 'Bush also nominated A. Noel Anketel... 90
345+
18 None [0, 162): 'Bush also nominated A. Noel Anketel... 91
346+
19 None [0, 162): 'Bush also nominated A. Noel Anketel... 92
347+
20 None [0, 162): 'Bush also nominated A. Noel Anketel... 93
348+
21 None [0, 162): 'Bush also nominated A. Noel Anketel... 94
349+
22 None [0, 162): 'Bush also nominated A. Noel Anketel... 95
350+
23 SpaceAfter=No [0, 162): 'Bush also nominated A. Noel Anketel... 96
351+
24 None [0, 162): 'Bush also nominated A. Noel Anketel... 97
352+
25 None [0, 162): 'Bush also nominated A. Noel Anketel... 98
353+
26 None [0, 162): 'Bush also nominated A. Noel Anketel... 99
354+
27 None [0, 162): 'Bush also nominated A. Noel Anketel... 100
355+
28 SpaceAfter=No [0, 162): 'Bush also nominated A. Noel Anketel... 101
356+
29 None [0, 162): 'Bush also nominated A. Noel Anketel... 102 """))
357+
358+
359+
252360
def test_conll_u_to_dataframes(self):
253361
dfs = conll_u_to_dataframes("test_data/io/test_conll/conll_u_test1.txt")
254362
self.maxDiff = None

0 commit comments

Comments
 (0)