Skip to content

Commit 398e1ee

Browse files
authored
Merge pull request #233 from ZachEichen/io_tweaks
MINOR: add additional flexibility to a few functions within tp.io
2 parents 57638f5 + a568fdf commit 398e1ee

File tree

4 files changed

+161
-12
lines changed

4 files changed

+161
-12
lines changed

text_extensions_for_pandas/io/bert.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -189,43 +189,53 @@ def conll_to_bert(df: pd.DataFrame, tokenizer: Any, bert: Any,
189189

190190

191191
def align_bert_tokens_to_corpus_tokens(
192-
spans_df: pd.DataFrame, corpus_toks_df: pd.DataFrame) -> pd.DataFrame:
192+
spans_df: pd.DataFrame, corpus_toks_df: pd.DataFrame,
193+
spans_df_token_col:str='span',corpus_df_token_col:str='span',
194+
entity_type_col:str='ent_type') -> pd.DataFrame:
193195
"""
194196
Expand entity matches from a BERT-based model so that they align
195197
with the corpus's original tokenization.
196198
197199
:param spans_df: DataFrame of extracted entities. Must contain two
198-
columns: "span" and "ent_type". Other columns ignored.
200+
columns with span and entity type information respectively. Other columns ignored.
199201
:param corpus_toks_df: DataFrame of the corpus's original tokenization,
200202
one row per token.
201-
Must contain a column "span" with character-based spans of
203+
Must contain a column with character-based spans of
202204
the tokens.
205+
:param spans_df_token_col: the name of the column in `spans_df`
206+
containing its tokenization. By default, `'span'`
207+
:param corpus_df_token_col: the name of the column in `corpus_toks_df`
208+
that contains its tokenization. By default `'span'`
209+
:param entity_type_col: the name of the column in spans_df that
210+
contains the entity types of the elements
211+
203212
204213
:returns: A new DataFrame with schema ["span", "ent_type"],
205214
where the "span" column contains token-based spans based off
206215
the *corpus* tokenization in `corpus_toks_df["span"]`.
207216
"""
208217
if len(spans_df.index) == 0:
209218
return spans_df.copy()
219+
210220
overlaps_df = (
211221
spanner
212-
.overlap_join(spans_df["span"], corpus_toks_df["span"],
222+
.overlap_join(spans_df[spans_df_token_col], corpus_toks_df[corpus_df_token_col],
213223
"span", "corpus_token")
214-
.merge(spans_df)
224+
.merge(spans_df,left_on='span',right_on=spans_df_token_col)
215225
)
216226
agg_df = (
217227
overlaps_df
218228
.groupby("span")
219-
.aggregate({"corpus_token": "sum", "ent_type": "first"})
229+
.aggregate({"corpus_token": "sum", entity_type_col: "first"})
220230
.reset_index()
221231
)
222232
cons_df = (
223233
spanner.consolidate(agg_df, "corpus_token")
224-
[["corpus_token", "ent_type"]]
234+
[["corpus_token", entity_type_col]]
225235
.rename(columns={"corpus_token": "span"})
226236
)
227237
cons_df["span"] = TokenSpanArray.align_to_tokens(
228-
corpus_toks_df["span"], cons_df["span"])
238+
corpus_toks_df[corpus_df_token_col], cons_df["span"])
229239
return cons_df
230240

231241

text_extensions_for_pandas/io/conll.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ def _parse_conll_u_file(
361361
merge_subtokens: bool = False,
362362
merge_subtoken_separator: str = "|",
363363
metadata_fields: Dict[str, str] = _DEFAULT_EWT_METADATA,
364+
doc_seperator = _EWT_DOC_SEPERATOR
364365
) -> List[List[_SentenceData]]:
365366
"""
366367
@@ -424,8 +425,8 @@ def _parse_conll_u_file(
424425
current_sentence.set_batch_conll_u_metadata(u_metadata)
425426
elif line[0] == "#":
426427
line_elems = line.split(" = ")
427-
if line_elems[0] == _EWT_DOC_SEPERATOR:
428-
if i > 0:
428+
if line_elems[0] == doc_seperator:
429+
if i > 0 and len(sentences) > 0 :
429430
# End of document. Wrap up this document and start a new one.
430431
#
431432
docs.append(sentences)
@@ -664,8 +665,8 @@ def _doc_to_df(
664665
sentence_ends_list = [] # Type: List[np.ndarray]
665666

666667
# conll_u metadata information.
667-
conll_u_ids_exsist = doc is not None and doc[0].has_conll_u_metadata
668-
conll_2009_format = doc is not None and doc[0]._conll_09_format
668+
conll_u_ids_exsist = doc is not None and len(doc)!=0 and doc[0].has_conll_u_metadata
669+
conll_2009_format = doc is not None and len(doc)!=0 and doc[0]._conll_09_format
669670
# this should be the same for all sentences so we check the first
670671

671672
if conll_2009_format:
@@ -1053,6 +1054,7 @@ def conll_u_to_dataframes(
10531054
merge_subtoken_separator: str = "|",
10541055
numeric_cols: List[str] = _DEFAULT_CONLL_U_NUMERIC_COLS,
10551056
metadata_fields: Dict[str, str] = _DEFAULT_EWT_METADATA,
1057+
separate_sentences_by_doc = False
10561058
) -> List[pd.DataFrame]:
10571059
"""
10581060
Parses a file from
@@ -1094,6 +1096,9 @@ def conll_u_to_dataframes(
10941096
if iob_columns is None:
10951097
iob_columns = [False for i in range(len(column_names))]
10961098
# fill with falses if not specified
1099+
1100+
#
1101+
split_doc_by = "# text" if separate_sentences_by_doc else _EWT_DOC_SEPERATOR
10971102

10981103
parsed_docs = _parse_conll_u_file(
10991104
input_file,
@@ -1103,6 +1108,7 @@ def conll_u_to_dataframes(
11031108
merge_subtokens=merge_subtokens,
11041109
merge_subtoken_separator=merge_subtoken_separator,
11051110
metadata_fields=metadata_fields,
1111+
doc_seperator=split_doc_by
11061112
)
11071113
doc_dfs = [
11081114
_doc_to_df(d, column_names, iob_columns, space_before_punct, conll_u=True)

text_extensions_for_pandas/io/test_bert.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,31 @@ def test_conll_to_bert(self):
198198
3 [15, 22): 'Failure' PER
199199
4 [23, 24): '(' <NA>"""))
200200

201+
## test with renamed fields
202+
without_embeddings_alt = without_embeddings.rename(columns={
203+
'span':'span-1',
204+
'ent_type':'ent_type-1'})
205+
first_df_alt = first_df.rename(columns={'span':'span-2'})
206+
aligned_toks_alt = align_bert_tokens_to_corpus_tokens(without_embeddings_alt,
207+
first_df_alt,
208+
spans_df_token_col='span-1',
209+
corpus_df_token_col='span-2',
210+
entity_type_col='ent_type-1'
211+
)
212+
print(str(aligned_toks_alt.iloc[:num_rows]))
213+
self.assertEqual(
214+
str(aligned_toks_alt.iloc[:num_rows]),
215+
# NOTE: Don't forget to add both sets of double-backslashes back in if you
216+
# copy-and-paste an updated version of the output below!
217+
textwrap.dedent("""\
218+
span ent_type-1
219+
0 [0, 3): 'Who' <NA>
220+
1 [4, 6): 'is' <NA>
221+
2 [7, 14): 'General' PER
222+
3 [15, 22): 'Failure' PER
223+
4 [23, 24): '(' <NA>"""))
224+
225+
201226
def test_seq_to_windows(self):
202227
for seqlen in range(1, 20):
203228
seq = np.arange(1, seqlen)

text_extensions_for_pandas/io/test_conll.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,114 @@ def test_conll_2003_to_dataframes_multi_field(self):
249249
),
250250
)
251251

252+
def test_conll_u_to_dataframes_split_by_sent(self):
253+
dfs = conll_u_to_dataframes("test_data/io/test_conll/conll_u_test1.txt"
254+
,separate_sentences_by_doc=True)
255+
self.maxDiff = None
256+
print(f"***{repr(dfs[4].drop(columns=['sentence_id','doc_id','paragraph_id']))}***")
257+
self.assertEqual(
258+
repr(dfs[4].drop(columns=['sentence_id','doc_id','paragraph_id'])),
259+
# NOTE the escaped backslash in the string below. Be sure to put it back
260+
# in when regenerating this string!
261+
textwrap.dedent("""\
262+
span lemma upostag xpostag \\
263+
0 [0, 4): 'Bush' Bush PROPN NNP
264+
1 [5, 9): 'also' also ADV RB
265+
2 [10, 19): 'nominated' nominate VERB VBD
266+
3 [20, 22): 'A.' A. PROPN NNP
267+
4 [23, 27): 'Noel' Noel PROPN NNP
268+
5 [28, 36): 'Anketell' Anketell PROPN NNP
269+
6 [37, 43): 'Kramer' Kramer PROPN NNP
270+
7 [44, 47): 'for' for ADP IN
271+
8 [48, 49): 'a' a DET DT
272+
9 [50, 52): '15' 15 NUM CD
273+
10 [52, 53): '-' - PUNCT HYPH
274+
11 [54, 58): 'year' year NOUN NN
275+
12 [59, 63): 'term' term NOUN NN
276+
13 [64, 66): 'as' as ADP IN
277+
14 [67, 76): 'associate' associate ADJ JJ
278+
15 [77, 82): 'judge' judge NOUN NN
279+
16 [83, 85): 'of' of ADP IN
280+
17 [86, 89): 'the' the DET DT
281+
18 [90, 98): 'District' District PROPN NNP
282+
19 [99, 101): 'of' of ADP IN
283+
20 [102, 110): 'Columbia' Columbia PROPN NNP
284+
21 [111, 116): 'Court' Court PROPN NNP
285+
22 [117, 119): 'of' of ADP IN
286+
23 [120, 127): 'Appeals' Appeal PROPN NNPS
287+
24 [127, 128): ',' , PUNCT ,
288+
25 [129, 138): 'replacing' replace VERB VBG
289+
26 [139, 143): 'John' John PROPN NNP
290+
27 [144, 152): 'Montague' Montague PROPN NNP
291+
28 [153, 161): 'Steadman' Steadman PROPN NNP
292+
29 [161, 162): '.' . PUNCT .
293+
294+
features head deprel deps \\
295+
0 Number=Sing 2 nsubj 3:nsubj
296+
1 None 2 advmod 3:advmod
297+
2 Mood=Ind|Tense=Past|VerbForm=Fin <NA> root 0:root
298+
3 Number=Sing 2 obj 3:obj
299+
4 Number=Sing 3 flat 4:flat
300+
5 Number=Sing 3 flat 4:flat
301+
6 Number=Sing 3 flat 4:flat
302+
7 None 12 case 13:case
303+
8 Definite=Ind|PronType=Art 12 det 13:det
304+
9 NumType=Card 11 nummod 12:nummod
305+
10 None 11 punct 12:punct
306+
11 Number=Sing 12 compound 13:compound
307+
12 Number=Sing 2 obl 3:obl:for
308+
13 None 15 case 16:case
309+
14 Degree=Pos 15 amod 16:amod
310+
15 Number=Sing 12 nmod 13:nmod:as
311+
16 None 18 case 19:case
312+
17 Definite=Def|PronType=Art 18 det 19:det
313+
18 Number=Sing 15 nmod 16:nmod:of
314+
19 None 21 case 22:case
315+
20 Number=Sing 21 compound 22:compound
316+
21 Number=Sing 18 nmod 19:nmod:of
317+
22 None 23 case 24:case
318+
23 Number=Plur 21 nmod 22:nmod:of
319+
24 None 2 punct 3:punct
320+
25 VerbForm=Ger 2 advcl 3:advcl
321+
26 Number=Sing 25 obj 26:obj
322+
27 Number=Sing 26 flat 27:flat
323+
28 Number=Sing 26 flat 27:flat
324+
29 None 2 punct 3:punct
325+
326+
misc sentence line_num
327+
0 None [0, 162): 'Bush also nominated A. Noel Anketel... 73
328+
1 None [0, 162): 'Bush also nominated A. Noel Anketel... 74
329+
2 None [0, 162): 'Bush also nominated A. Noel Anketel... 75
330+
3 None [0, 162): 'Bush also nominated A. Noel Anketel... 76
331+
4 None [0, 162): 'Bush also nominated A. Noel Anketel... 77
332+
5 None [0, 162): 'Bush also nominated A. Noel Anketel... 78
333+
6 None [0, 162): 'Bush also nominated A. Noel Anketel... 79
334+
7 None [0, 162): 'Bush also nominated A. Noel Anketel... 80
335+
8 None [0, 162): 'Bush also nominated A. Noel Anketel... 81
336+
9 SpaceAfter=No [0, 162): 'Bush also nominated A. Noel Anketel... 82
337+
10 SpaceAfter=No [0, 162): 'Bush also nominated A. Noel Anketel... 83
338+
11 None [0, 162): 'Bush also nominated A. Noel Anketel... 84
339+
12 None [0, 162): 'Bush also nominated A. Noel Anketel... 85
340+
13 None [0, 162): 'Bush also nominated A. Noel Anketel... 86
341+
14 None [0, 162): 'Bush also nominated A. Noel Anketel... 87
342+
15 None [0, 162): 'Bush also nominated A. Noel Anketel... 88
343+
16 None [0, 162): 'Bush also nominated A. Noel Anketel... 89
344+
17 None [0, 162): 'Bush also nominated A. Noel Anketel... 90
345+
18 None [0, 162): 'Bush also nominated A. Noel Anketel... 91
346+
19 None [0, 162): 'Bush also nominated A. Noel Anketel... 92
347+
20 None [0, 162): 'Bush also nominated A. Noel Anketel... 93
348+
21 None [0, 162): 'Bush also nominated A. Noel Anketel... 94
349+
22 None [0, 162): 'Bush also nominated A. Noel Anketel... 95
350+
23 SpaceAfter=No [0, 162): 'Bush also nominated A. Noel Anketel... 96
351+
24 None [0, 162): 'Bush also nominated A. Noel Anketel... 97
352+
25 None [0, 162): 'Bush also nominated A. Noel Anketel... 98
353+
26 None [0, 162): 'Bush also nominated A. Noel Anketel... 99
354+
27 None [0, 162): 'Bush also nominated A. Noel Anketel... 100
355+
28 SpaceAfter=No [0, 162): 'Bush also nominated A. Noel Anketel... 101
356+
29 None [0, 162): 'Bush also nominated A. Noel Anketel... 102 """))
357+
358+
359+
252360
def test_conll_u_to_dataframes(self):
253361
dfs = conll_u_to_dataframes("test_data/io/test_conll/conll_u_test1.txt")
254362
self.maxDiff = None

0 commit comments

Comments
 (0)