
Commit 3058da0

Fix 2 bugs in bead finding related to dates and sampling
After updating to pandas 1.1.0 the bead location summary plots broke. These matplotlib time series plots used to accept pandas datetime objects directly; after the update it became necessary to explicitly convert to matplotlib dates first with matplotlib.dates.date2num.

When bead finding input data exceeds the user-specified event limit, it is randomly sampled down to that limit. Sampling rearranges the dataframe index, which had unexpected effects during rough filtering: mark_noise and mark_saturated returned Series with new indexes, and subsequent comparisons with the original dataframe aligned along indexes that no longer matched. Fixed this by 1) resetting the index after sampling and 2) returning ndarrays from mark_noise and mark_saturated so that index alignment can't happen at all.

When comparing pandas Series, be aware of your indexes! See e.g. pandas-dev/pandas#19855.
1 parent 7091397 commit 3058da0
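
To make the index-alignment pitfall concrete, here is a minimal sketch with made-up data (none of it from the seaflowpy codebase) showing how a boolean mask built from `.values` and a dataframe whose index was rearranged by sampling can silently select the wrong rows:

```python
import pandas as pd

# A frame whose index is no longer 0..n-1, as it is after df.sample()
df = pd.DataFrame({"fsc_small": [0, 5, 7]}, index=[2, 0, 1])

# A helper that builds its result from .values gets a fresh RangeIndex
mask = pd.Series(df["fsc_small"].values > 1)  # True at positions 1 and 2

# Boolean indexing with a Series aligns on index labels, not positions:
print(df[mask])         # label-aligned: keeps labels 1 and 2 -> values 0 and 7 (wrong rows)
print(df[mask.values])  # an ndarray mask is positional -> values 5 and 7 (intended rows)
```

Both fixes in this commit attack this from one side each: resetting the index makes labels match positions again, and returning ndarrays removes label alignment from the picture entirely.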

3 files changed: +14 -11 lines changed

src/seaflowpy/beads.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -928,6 +928,8 @@ def plot_cruise(bead_df, outpath, filter_params_path="", cruise="", iqr=None):
 
 
 def plot_column(ax, bead_df, col, filt_df=None, cruise="", iqr=None):
+    bead_df = bead_df.copy()
+    bead_df["date"] = mdates.date2num(bead_df["date"])  # for better matplotlib compat
     locator = mdates.AutoDateLocator(minticks=3, maxticks=7)
     formatter = mdates.ConciseDateFormatter(locator)
     ylims = (0, 2**16)
@@ -960,9 +962,9 @@ def plot_column(ax, bead_df, col, filt_df=None, cruise="", iqr=None):
     # Plot a vertical line for each bead coord point showing interquartile range
     iqr_minys = bead_df[f"{col}_1Q"]
     iqr_maxys = bead_df[f"{col}_3Q"]
-    iqr_xs = mdates.date2num(bead_df["date"])
+    iqr_xs = bead_df["date"]
     iqr_colors = []
-    for i, x in enumerate(iqr_xs):
+    for i, _ in enumerate(iqr_xs):
         if iqr and bead_df.loc[i, f"{col}_IQR"] > iqr:
             iqr_colors.append(mpl.colors.to_rgba("red"))
         else:
```
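
As an illustration of the date handling fix, here is a minimal, self-contained sketch. The data, column values, and output filename are invented; only the date2num conversion and the locator/formatter setup are taken from the diff above:

```python
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd

# Invented stand-in for a bead summary frame
bead_df = pd.DataFrame({
    "date": pd.date_range("2021-01-01", periods=5, freq="D"),
    "fsc_small": [100, 120, 115, 130, 125],
})

# Convert once up front; after pandas 1.1.0 these plots could no longer
# consume pandas datetimes directly everywhere
xs = mdates.date2num(bead_df["date"])

fig, ax = plt.subplots()
ax.plot(xs, bead_df["fsc_small"], marker="o")

# The same tick setup the patched plot_column uses
locator = mdates.AutoDateLocator(minticks=3, maxticks=7)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter(locator))
fig.savefig("beads_example.png")
```

Converting the whole column once at the top of the function also lets the interquartile-range loop below reuse `bead_df["date"]` directly, which is why the second hunk drops its own date2num call.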

src/seaflowpy/cli/commands/evt_cmd.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -215,7 +215,7 @@ def beads_evt_cmd(cruise, cytograms, event_limit, frac, iqr, min_date,
             tmp_df = group.reset_index(drop=True)
             logging.info("clustering %s (%d events)", str(name), len(group))
         else:
-            tmp_df = group.reset_index(drop=True).sample(n=event_limit, random_state=12345)
+            tmp_df = group.reset_index(drop=True).sample(n=event_limit, random_state=12345).reset_index(drop=True)
             logging.info("clustering %s (%d events reduced to %d)", str(name), len(group), len(tmp_df))
         try:
             results = beads.find_beads(
@@ -249,6 +249,10 @@ def beads_evt_cmd(cruise, cytograms, event_limit, frac, iqr, min_date,
     out_df = pd.concat(all_dfs, ignore_index=True)
     out_df["resolution"] = resolution
     out_df["resolution"] = out_df["resolution"].astype("category")
+    parquet_path = os.path.join(out_dir, cruise + f".beads-by-{resolution}" + ".parquet")
+
+    logging.info("writing bead position parquet %s", parquet_path)
+    out_df.to_parquet(parquet_path)
     logging.info("creating summary plot")
     beads.plot_cruise(
         out_df,
@@ -257,9 +261,6 @@ def beads_evt_cmd(cruise, cytograms, event_limit, frac, iqr, min_date,
         cruise=cruise,
         iqr=iqr
     )
-    parquet_path = os.path.join(out_dir, cruise + f".beads-by-{resolution}" + ".parquet")
-    logging.info("writing bead position parquet " + parquet_path)
-    out_df.to_parquet(parquet_path)
     logging.info("done")
 
 
```
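A quick sketch, with toy data, of why the extra `reset_index(drop=True)` after sampling matters:

```python
import pandas as pd

df = pd.DataFrame({"D1": range(10)})

# sample() keeps the original row labels, so the result's index is scrambled
sampled = df.sample(n=5, random_state=12345)
print(sampled.index)   # some permutation of original labels -- not 0..4

# Resetting gives a clean RangeIndex, so later label-based operations
# (Series comparisons, boolean indexing) behave positionally again
sampled = df.sample(n=5, random_state=12345).reset_index(drop=True)
print(sampled.index)   # RangeIndex(start=0, stop=5, step=1)
```

The hunk also moves the parquet write ahead of plotting, so the bead positions are saved even if plot generation fails.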
src/seaflowpy/particleops.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -187,14 +187,14 @@ def mark_noise(df):
 
     Parameters
     ----------
-    pandas.Series
+    numpy.ndarray
         Boolean array of noise events.
     """
     if len(set(list(df)).intersection(set(["D1", "D2", "fsc_small"]))) < 3:
         raise ValueError("Can't apply noise filter without D1, D2, and fsc_small")
 
     # Mark noise events in new column "noise"
-    return pd.Series(~((df["fsc_small"].values > 1) | (df["D1"].values > 1) | (df["D2"].values > 1)))
+    return ~((df["fsc_small"].values > 1) | (df["D1"].values > 1) | (df["D2"].values > 1))
 
 
 def mark_saturated(df):
@@ -206,15 +206,15 @@ def mark_saturated(df):
 
     Parameters
     ----------
-    pandas.Series
+    numpy.ndarray
         Boolean array of saturated events.
     """
     if len(set(list(df)).intersection(set(["D1", "D2"]))) < 2:
         raise ValueError("Can't apply saturation filter without D1 and D2")
     if len(df.index) == 0:
-        return pd.Series(np.full(len(df.index), False))
+        return np.full(len(df.index), False)
     else:
-        return pd.Series((df["D1"].values == df["D1"].values.max()) | (df["D2"].values == df["D2"].values.max()))
+        return (df["D1"].values == df["D1"].values.max()) | (df["D2"].values == df["D2"].values.max())
 
 
 def merge_opp_vct(oppdf, vctdf):
```
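
A brief usage sketch of the changed helpers. The data and the deliberately scrambled index (standing in for a sampled dataframe) are invented; the helper names and their post-commit ndarray return type come from the diff above:

```python
import pandas as pd
from seaflowpy import particleops

# Toy events with a deliberately scrambled index, as after sampling
df = pd.DataFrame({
    "D1": [0, 100, 100, 3],
    "D2": [0, 100, 5, 2],
    "fsc_small": [0, 10, 20, 30],
}, index=[3, 1, 2, 0])

noise = particleops.mark_noise(df)          # ndarray, not Series
saturated = particleops.mark_saturated(df)  # ndarray, not Series

# ndarray masks combine and filter purely by position, so the scrambled
# index can no longer trigger silent label alignment
clean = df[~noise & ~saturated]
print(clean)
```

Returning ndarrays is the belt-and-suspenders half of the fix: even if a caller forgets to reset the index after sampling, positional masks keep rough filtering correct.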
