
Commit b9ea7a3 (1 parent: c98274e)

JIT: update Pandas to build a single DataFrame per query
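Previously, this function consumed parsed_rows with itertools.groupby, building a small DataFrame per (source, signal, geo_type, geo_value) group; after this commit it builds one DataFrame for the whole query and lets pandas do the grouping. A minimal sketch of the difference, assuming rows shaped roughly like the covidcast schema (the row values below are made up for illustration):

    import pandas as pd
    from itertools import groupby

    rows = [
        {"source": "jhu-csse", "signal": "confirmed_cumulative_num",
         "geo_type": "state", "geo_value": "pa", "time_value": 20200401, "value": 10.0},
        {"source": "jhu-csse", "signal": "confirmed_cumulative_num",
         "geo_type": "state", "geo_value": "pa", "time_value": 20200402, "value": 13.0},
    ]
    KEY = ["source", "signal", "geo_type", "geo_value"]

    # Old approach: stream groups with itertools.groupby (input must already
    # arrive sorted by KEY), then build one small DataFrame per group.
    for _, group in groupby(rows, key=lambda r: tuple(r[k] for k in KEY)):
        df = pd.DataFrame.from_records(list(group))

    # New approach: build a single DataFrame per query and group with pandas,
    # which sorts keys itself and drops the ordering assumption on the stream.
    df = pd.DataFrame(rows)
    for _, group_df in df.groupby(KEY):
        pass  # per-group transforms are applied here

One trade-off worth noting: the old path never materialized more than one group at a time, while the new path holds the full query result in memory in exchange for simpler, vectorized grouping.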

1 file changed: src/server/endpoints/covidcast_utils/model.py (+25 −31)
@@ -491,7 +491,6 @@ def _generate_transformed_rows(
     parsed_rows: Iterator[Dict],
     transform_dict: Optional[SignalTransforms] = None,
     transform_args: Optional[Dict] = None,
-    group_keyfunc: Optional[Callable] = None,
 ) -> Iterator[Dict]:
     """Applies time-series transformations to streamed rows from a database.
 
@@ -503,9 +502,6 @@ def _generate_transformed_rows(
         For example, transform_dict may be {("jhu-csse", "confirmed_cumulative_num"): [("jhu-csse", "confirmed_incidence_num"), ("jhu-csse", "confirmed_7dav_incidence_num")]}.
     transform_args: Optional[Dict], default None
         A dictionary of keyword arguments for the transformer functions.
-    group_keyfunc: Optional[Callable], default None
-        The groupby function to use to order the streamed rows. Note that Python groupby does not do any sorting, so
-        parsed_rows are assumed to be sorted in accord with this groupby.
 
     Yields:
     transformed rows: Dict
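For orientation, the mapping the docstring describes looks like this written out; a sketch using the plain-tuple form from the docstring's own example (the code below keys the dict by SourceSignalPair objects, and the per-signal comments are illustrative):

    # transform_dict maps a base (source, signal) to the signals derived from it.
    # Tuple keys follow the docstring's example; the real dict uses
    # SourceSignalPair keys.
    transform_dict = {
        ("jhu-csse", "confirmed_cumulative_num"): [
            ("jhu-csse", "confirmed_incidence_num"),       # daily change (DIFF)
            ("jhu-csse", "confirmed_7dav_incidence_num"),  # smoothed daily change (DIFF_SMOOTH)
        ],
    }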
@@ -515,59 +511,57 @@
         transform_args = dict()
     if not transform_dict:
         transform_dict = dict()
-    if not group_keyfunc:
-        group_keyfunc = lambda row: (row["source"], row["signal"], row["geo_type"], row["geo_value"])
 
-    for key, source_signal_geo_rows in groupby(parsed_rows, group_keyfunc):
+    # TODO: Fix these to come as an argument?
+    fields_string = ["geo_type", "geo_value", "source", "signal", "time_type"]
+    fields_int = ["time_value", "direction", "issue", "lag", "missing_value", "missing_stderr", "missing_sample_size"]
+    fields_float = ["value", "stderr", "sample_size"]
+    columns = fields_string + fields_int + fields_float
+    df = pd.DataFrame(parsed_rows, columns=columns)
+    for key, group_df in df.groupby(["source", "signal", "geo_type", "geo_value"]):
         base_source_name, base_signal_name, _, _ = key
         # Extract the list of derived signals; if a signal is not in the dictionary, then use the identity map.
         derived_signal_transform_map: SourceSignalPair = transform_dict.get(SourceSignalPair(base_source_name, [base_signal_name]), SourceSignalPair(base_source_name, [base_signal_name]))
         # Create a list of source-signal pairs along with the transformation required for the signal.
         signal_names_and_transforms: List[Tuple[str, Callable]] = [(derived_signal, _get_base_signal_transform((base_source_name, derived_signal))) for derived_signal in derived_signal_transform_map.signal]
 
-        # TODO: Fix these to come as an argument.
-        fields_string = ["geo_type", "geo_value", "source", "signal", "time_type"]
-        fields_int = ["time_value", "direction", "issue", "lag", "missing_value", "missing_stderr", "missing_sample_size"]
-        fields_float = ["value", "stderr", "sample_size"]
-        columns = fields_string + fields_int + fields_float
-        df = pd.DataFrame.from_records(source_signal_geo_rows, columns=columns)
         for derived_signal, transform in signal_names_and_transforms:
             if transform == IDENTITY:
-                yield from df.to_dict(orient="records")
+                yield from group_df.to_dict(orient="records")
                 continue
-
-            df2 = df.set_index(["time_value"])
-            df2 = df2.reindex(iterate_over_range(df2.index.min(), df2.index.max(), inclusive=True))
+
+            derived_df = group_df.set_index(["time_value"])
+            derived_df = derived_df.reindex(iterate_over_range(derived_df.index.min(), derived_df.index.max(), inclusive=True))
 
             if transform == DIFF:
-                df2["value"] = df2["value"].diff()
+                derived_df["value"] = derived_df["value"].diff()
                 window_length = 2
             elif transform == SMOOTH:
-                df2["value"] = df2["value"].rolling(7).mean()
+                derived_df["value"] = derived_df["value"].rolling(7).mean()
                 window_length = 7
            elif transform == DIFF_SMOOTH:
-                df2["value"] = df2["value"].diff().rolling(7).mean()
+                derived_df["value"] = derived_df["value"].diff().rolling(7).mean()
                 window_length = 8
             else:
                 raise ValueError(f"Unknown transform for {derived_signal}.")
 
-            df2 = df2.assign(
-                geo_type = df2["geo_type"].fillna(method="ffill"),
-                geo_value = df2["geo_value"].fillna(method="ffill"),
-                source = df2["source"].fillna(method="ffill"),
+            derived_df = derived_df.assign(
+                geo_type = derived_df["geo_type"].fillna(method="ffill"),
+                geo_value = derived_df["geo_value"].fillna(method="ffill"),
+                source = derived_df["source"].fillna(method="ffill"),
                 signal = derived_signal,
-                time_type = df2["time_type"].fillna(method="ffill"),
-                direction = df2["direction"].fillna(method="ffill"),
-                issue = df2["issue"].rolling(window_length).max(),
-                lag = df2["lag"].rolling(window_length).max(),
-                missing_value=np.where(df2["value"].isna(), Nans.NOT_APPLICABLE, Nans.NOT_MISSING),
+                time_type = derived_df["time_type"].fillna(method="ffill"),
+                direction = derived_df["direction"].fillna(method="ffill"),
+                issue = derived_df["issue"].rolling(window_length).max(),
+                lag = derived_df["lag"].rolling(window_length).max(),
+                missing_value=np.where(derived_df["value"].isna(), Nans.NOT_APPLICABLE, Nans.NOT_MISSING),
                 missing_stderr=Nans.NOT_APPLICABLE,
                 missing_sample_size=Nans.NOT_APPLICABLE,
                 stderr=np.nan,
                 sample_size=np.nan,
             )
-            df2 = df2.iloc[window_length - 1:]
-            for row in df2.reset_index().to_dict(orient="records"):
+            derived_df = derived_df.iloc[window_length - 1:]
+            for row in derived_df.reset_index().to_dict(orient="records"):
                 row.update({
                     "issue": int(row["issue"]) if not np.isnan(row["issue"]) else row["issue"],
                     "lag": int(row["lag"]) if not np.isnan(row["lag"]) else row["lag"]
