Skip to content

Commit ef14041

Browse files
authored
Merge pull request #1215 from cmu-delphi/EPID-61-fluview-clinical-queries-return-duplicate-results
Remove duplicates from `fluview_clinical` table
2 parents 866ff00 + e0a0940 commit ef14041

File tree

2 files changed

+116
-1
lines changed

2 files changed

+116
-1
lines changed

src/ddl/fluview.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -269,8 +269,8 @@ CREATE TABLE `fluview_clinical` (
269269
`percent_a` double DEFAULT NULL,
270270
`percent_b` double DEFAULT NULL,
271271
PRIMARY KEY (`id`),
272+
UNIQUE KEY `issue` (`issue`, `epiweek`, `region`),
272273
KEY `release_date` (`release_date`),
273-
KEY `issue` (`issue`),
274274
KEY `epiweek` (`epiweek`),
275275
KEY `region` (`region`),
276276
KEY `lag` (`lag`)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
USE epidata;
2+
3+
-- Create new `fluview_clinical` table with proper unique constraint.
4+
CREATE TABLE `fluview_clinical_v2` (
5+
`id` int(11) NOT NULL AUTO_INCREMENT,
6+
`release_date` date NOT NULL,
7+
`issue` int(11) NOT NULL,
8+
`epiweek` int(11) NOT NULL,
9+
`region` varchar(12) NOT NULL,
10+
`lag` int(11) NOT NULL,
11+
`total_specimens` int(11) NOT NULL,
12+
`total_a` int(11) DEFAULT NULL,
13+
`total_b` int(11) DEFAULT NULL,
14+
`percent_positive` double DEFAULT NULL,
15+
`percent_a` double DEFAULT NULL,
16+
`percent_b` double DEFAULT NULL,
17+
PRIMARY KEY (`id`),
18+
UNIQUE KEY `issue` (`issue`, `epiweek`, `region`),
19+
KEY `release_date` (`release_date`),
20+
KEY `epiweek` (`epiweek`),
21+
KEY `region` (`region`),
22+
KEY `lag` (`lag`)
23+
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
24+
25+
26+
-- Insert unique rows from `fluview_clinical` into `fluview_clinical_v2`.
27+
-- This is done in order to reset ID counter and fill gaps betwen row's ids.
28+
INSERT INTO
29+
fluview_clinical_v2(
30+
`release_date`,
31+
`issue`,
32+
`epiweek`,
33+
`region`,
34+
`lag`,
35+
`total_specimens`,
36+
`total_a`,
37+
`total_b`,
38+
`percent_positive`,
39+
`percent_a`,
40+
`percent_b`
41+
)
42+
SELECT
43+
min_release_date release_date,
44+
tmp.issue,
45+
tmp.epiweek,
46+
tmp.region,
47+
tmp.lag,
48+
tmp.total_specimens,
49+
tmp.total_a,
50+
tmp.total_b,
51+
tmp.percent_positive,
52+
tmp.percent_a,
53+
tmp.percent_b
54+
FROM
55+
(
56+
-- get data associated with the most recent `release_date` for each unique `(epiweek, issue, region)` key
57+
SELECT
58+
s.release_date,
59+
s.issue,
60+
s.epiweek,
61+
s.region,
62+
s.lag,
63+
s.total_specimens,
64+
s.total_a,
65+
s.total_b,
66+
s.percent_positive,
67+
s.percent_a,
68+
s.percent_b
69+
FROM
70+
(
71+
SELECT
72+
fc.release_date,
73+
fc.issue,
74+
fc.epiweek,
75+
fc.region,
76+
fc.lag,
77+
fc.total_specimens,
78+
fc.total_a,
79+
fc.total_b,
80+
fc.percent_positive,
81+
fc.percent_a,
82+
fc.percent_b,
83+
ROW_NUMBER() OVER(
84+
PARTITION BY fc.epiweek,
85+
fc.issue,
86+
fc.region
87+
ORDER BY
88+
fc.release_date DESC
89+
) as row_num
90+
FROM
91+
fluview_clinical fc
92+
) s
93+
WHERE
94+
s.row_num = 1
95+
) tmp
96+
JOIN (
97+
-- JOIN to recover first/least `release_date` because thats what the acquisition process does: https://github.com/cmu-delphi/delphi-epidata/blob/7fd20cd5c34b33c2310be67867b46a91aa840be9/src/acquisition/fluview/fluview_update.py#L326
98+
SELECT
99+
MIN(fc.release_date) as min_release_date,
100+
fc.issue,
101+
fc.epiweek,
102+
fc.region
103+
FROM
104+
fluview_clinical fc
105+
GROUP BY
106+
fc.issue,
107+
fc.epiweek,
108+
fc.region
109+
) rel_date ON tmp.issue = rel_date.issue
110+
AND tmp.epiweek = rel_date.epiweek
111+
AND tmp.region = rel_date.region;
112+
113+
DROP TABLE fluview_clinical;
114+
115+
ALTER TABLE fluview_clinical_v2 RENAME fluview_clinical;

0 commit comments

Comments
 (0)