
Commit 3058da0

Fix 2 bugs in bead finding related to dates and sampling
After updating to pandas 1.1.0 the bead location summary plots broke. These matplotlib time series plots used to accept pandas datetime objects directly; after the update it became necessary to explicitly convert to matplotlib dates first with matplotlib.dates.date2num.

When bead finding input data exceeds the user-specified event limit, it is randomly sampled down to that limit. Sampling rearranges the dataframe index, which had unexpected effects during rough filtering: mark_noise and mark_saturated returned Series with new indexes, and subsequent comparisons with the original dataframe aligned along indexes that no longer matched. Fixed this by 1) resetting the index after sampling and 2) returning ndarrays from mark_noise and mark_saturated so that index alignment can't happen at all.

When comparing pandas Series, be aware of your indexes! See e.g. pandas-dev/pandas#19855.
1 parent 7091397 commit 3058da0
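
To make the index-alignment pitfall concrete, here is a minimal sketch with made-up data (none of it from the seaflowpy codebase) showing how a boolean mask built from `.values` and a dataframe whose index was rearranged by sampling can silently select the wrong rows:

```python
import pandas as pd

# A frame whose index is no longer 0..n-1, as it is after df.sample()
df = pd.DataFrame({"fsc_small": [0, 5, 7]}, index=[2, 0, 1])

# A helper that builds its result from .values gets a fresh RangeIndex
mask = pd.Series(df["fsc_small"].values > 1)  # True at positions 1 and 2

# Boolean indexing with a Series aligns on index labels, not positions:
print(df[mask])         # label-aligned: keeps labels 1 and 2 -> values 0 and 7 (wrong rows)
print(df[mask.values])  # an ndarray mask is positional -> values 5 and 7 (intended rows)
```

Both fixes in this commit attack this from one side each: resetting the index makes labels match positions again, and returning ndarrays removes label alignment from the picture entirely.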

3 files changed: +14 -11 lines changed

src/seaflowpy/beads.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -928,6 +928,8 @@ def plot_cruise(bead_df, outpath, filter_params_path="", cruise="", iqr=None):
 
 
 def plot_column(ax, bead_df, col, filt_df=None, cruise="", iqr=None):
+    bead_df = bead_df.copy()
+    bead_df["date"] = mdates.date2num(bead_df["date"])  # for better matplotlib compat
     locator = mdates.AutoDateLocator(minticks=3, maxticks=7)
     formatter = mdates.ConciseDateFormatter(locator)
     ylims = (0, 2**16)
@@ -960,9 +962,9 @@ def plot_column(ax, bead_df, col, filt_df=None, cruise="", iqr=None):
     # Plot a vertical line for each bead coord point showing interquartile range
     iqr_minys = bead_df[f"{col}_1Q"]
     iqr_maxys = bead_df[f"{col}_3Q"]
-    iqr_xs = mdates.date2num(bead_df["date"])
+    iqr_xs = bead_df["date"]
     iqr_colors = []
-    for i, x in enumerate(iqr_xs):
+    for i, _ in enumerate(iqr_xs):
         if iqr and bead_df.loc[i, f"{col}_IQR"] > iqr:
             iqr_colors.append(mpl.colors.to_rgba("red"))
         else:
```
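
As an illustration of the date handling fix, here is a minimal, self-contained sketch. The data, column values, and output filename are invented; only the date2num conversion and the locator/formatter setup are taken from the diff above:

```python
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd

# Invented stand-in for a bead summary frame
bead_df = pd.DataFrame({
    "date": pd.date_range("2021-01-01", periods=5, freq="D"),
    "fsc_small": [100, 120, 115, 130, 125],
})

# Convert once up front; after pandas 1.1.0 these plots could no longer
# consume pandas datetimes directly everywhere
xs = mdates.date2num(bead_df["date"])

fig, ax = plt.subplots()
ax.plot(xs, bead_df["fsc_small"], marker="o")

# The same tick setup the patched plot_column uses
locator = mdates.AutoDateLocator(minticks=3, maxticks=7)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter(locator))
fig.savefig("beads_example.png")
```

Converting the whole column once at the top of the function also lets the interquartile-range loop below reuse `bead_df["date"]` directly, which is why the second hunk drops its own date2num call.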

src/seaflowpy/cli/commands/evt_cmd.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -215,7 +215,7 @@ def beads_evt_cmd(cruise, cytograms, event_limit, frac, iqr, min_date,
             tmp_df = group.reset_index(drop=True)
             logging.info("clustering %s (%d events)", str(name), len(group))
         else:
-            tmp_df = group.reset_index(drop=True).sample(n=event_limit, random_state=12345)
+            tmp_df = group.reset_index(drop=True).sample(n=event_limit, random_state=12345).reset_index(drop=True)
             logging.info("clustering %s (%d events reduced to %d)", str(name), len(group), len(tmp_df))
         try:
             results = beads.find_beads(
@@ -249,6 +249,10 @@ def beads_evt_cmd(cruise, cytograms, event_limit, frac, iqr, min_date,
     out_df = pd.concat(all_dfs, ignore_index=True)
     out_df["resolution"] = resolution
     out_df["resolution"] = out_df["resolution"].astype("category")
+    parquet_path = os.path.join(out_dir, cruise + f".beads-by-{resolution}" + ".parquet")
+
+    logging.info("writing bead position parquet %s", parquet_path)
+    out_df.to_parquet(parquet_path)
     logging.info("creating summary plot")
     beads.plot_cruise(
         out_df,
@@ -257,9 +261,6 @@ def beads_evt_cmd(cruise, cytograms, event_limit, frac, iqr, min_date,
         cruise=cruise,
         iqr=iqr
     )
-    parquet_path = os.path.join(out_dir, cruise + f".beads-by-{resolution}" + ".parquet")
-    logging.info("writing bead position parquet " + parquet_path)
-    out_df.to_parquet(parquet_path)
     logging.info("done")
 
 
```
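A quick sketch, with toy data, of why the extra `reset_index(drop=True)` after sampling matters:

```python
import pandas as pd

df = pd.DataFrame({"D1": range(10)})

# sample() keeps the original row labels, so the result's index is scrambled
sampled = df.sample(n=5, random_state=12345)
print(sampled.index)   # some permutation of original labels -- not 0..4

# Resetting gives a clean RangeIndex, so later label-based operations
# (Series comparisons, boolean indexing) behave positionally again
sampled = df.sample(n=5, random_state=12345).reset_index(drop=True)
print(sampled.index)   # RangeIndex(start=0, stop=5, step=1)
```

The hunk also moves the parquet write ahead of plotting, so the bead positions are saved even if plot generation fails.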
src/seaflowpy/particleops.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -187,14 +187,14 @@ def mark_noise(df):
 
     Parameters
     ----------
-    pandas.Series
+    numpy.ndarray
         Boolean array of noise events.
     """
     if len(set(list(df)).intersection(set(["D1", "D2", "fsc_small"]))) < 3:
         raise ValueError("Can't apply noise filter without D1, D2, and fsc_small")
 
     # Mark noise events in new column "noise"
-    return pd.Series(~((df["fsc_small"].values > 1) | (df["D1"].values > 1) | (df["D2"].values > 1)))
+    return ~((df["fsc_small"].values > 1) | (df["D1"].values > 1) | (df["D2"].values > 1))
 
 
 def mark_saturated(df):
@@ -206,15 +206,15 @@ def mark_saturated(df):
 
     Parameters
     ----------
-    pandas.Series
+    numpy.ndarray
         Boolean array of saturated events.
     """
     if len(set(list(df)).intersection(set(["D1", "D2"]))) < 2:
         raise ValueError("Can't apply saturation filter without D1 and D2")
     if len(df.index) == 0:
-        return pd.Series(np.full(len(df.index), False))
+        return np.full(len(df.index), False)
     else:
-        return pd.Series((df["D1"].values == df["D1"].values.max()) | (df["D2"].values == df["D2"].values.max()))
+        return (df["D1"].values == df["D1"].values.max()) | (df["D2"].values == df["D2"].values.max())
 
 
 def merge_opp_vct(oppdf, vctdf):
```
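
A brief usage sketch of the changed helpers. The data and the deliberately scrambled index (standing in for a sampled dataframe) are invented; the helper names and their post-commit ndarray return type come from the diff above:

```python
import pandas as pd
from seaflowpy import particleops

# Toy events with a deliberately scrambled index, as after sampling
df = pd.DataFrame({
    "D1": [0, 100, 100, 3],
    "D2": [0, 100, 5, 2],
    "fsc_small": [0, 10, 20, 30],
}, index=[3, 1, 2, 0])

noise = particleops.mark_noise(df)          # ndarray, not Series
saturated = particleops.mark_saturated(df)  # ndarray, not Series

# ndarray masks combine and filter purely by position, so the scrambled
# index can no longer trigger silent label alignment
clean = df[~noise & ~saturated]
print(clean)
```

Returning ndarrays is the belt-and-suspenders half of the fix: even if a caller forgets to reset the index after sampling, positional masks keep rough filtering correct.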
