Skip to content

Commit d66168d

Browse files
committed
wip panel data
1 parent 172ab1e commit d66168d

File tree

6 files changed

+400
-2
lines changed

6 files changed

+400
-2
lines changed

R/data.R

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,27 @@
3333
#' June 7 is the average of the underlying data for June 1 through 7,
3434
#' inclusive.
3535
"case_death_rate_subset"
36+
37+
#' Subset of Statistics Canada employment numbers by industry and province
38+
#'
39+
#' @format A tibble with 109,388 rows and 5 variables:
40+
#' \describe{
41+
#' \item{geo_value}{The province in Canada associated with each
42+
#' row of measurements.}
43+
#' \item{time_value}{The time value, in YYYY-MM-01 format,
44+
#' associated with each row of measurements.}
45+
#' \item{ppl_count}{The number of people employed, seasonally
46+
#' adjusted.}
47+
#' \item{employee_type}{The type of employee}
48+
#' \item{naics_industry}{The industry name and associated code
49+
#' according to \href{https://www23.statcan.gc.ca/imdb/p3VD.pl?Function=getVD&TVD=1181553}{NAICS}}
50+
#' }
51+
#' @source This object contains modified data from the following Statistics Canada
52+
#' data table: \href{https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410022001#data}{Table 14-10-0220-01 Employment and average weekly earnings (including overtime) for all employees by industry, monthly, seasonally adjusted, Canada}
53+
#'
54+
#' Modifications:
55+
#' * From the given Statistics Canada table, the employee counts
56+
#' are taken as-is. Only \href{https://www23.statcan.gc.ca/imdb/p3VD.pl?Function=getVD&TVD=1181553}{NAICS} codes at hierarchy level 2 are
57+
#' filtered in. Only data rows that are \href{https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol}{acceptable quality or higher and not missing}
58+
#' according to Statistics Canada are kept.
59+
"statcan_employ_subset"

data-raw/statcan_employ_subset.R

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
library(epipredict)
2+
library(epiprocess)
3+
library(cansim)
4+
library(dplyr)
5+
library(stringr)
6+
7+
# Run this once
8+
# https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410022001#data
9+
# Fetch the raw Statistics Canada table via cansim::get_cansim().
# NOTE(review): the URL in the comment above (pid=1410022001) and the @source
# entry in R/data.R both point at table 14-10-0220-01, whereas this call
# requests "14-10-0201-01" -- confirm which table is intended and make the
# URL, this call, and the package documentation agree.
statcan_employ <- get_cansim("14-10-0201-01")
10+
11+
# ================== Subset & Filtering ==================
12+
# Reduce the raw table to the columns needed for the packaged dataset, rename
# them to epi_df-style names, and keep only province-level rows for top-level
# NAICS sectors whose quality flag is acceptable or better.
#
# Columns intentionally not carried over: DGUID, UOM, UOM_ID, SCALAR_FACTOR,
# SCALAR_ID, VECTOR, COORDINATE, SYMBOL, TERMINATED, DECIMALS, GeoUID,
# "Hierarchy for GEO", "Classification Code for Type of employee",
# "Hierarchy for Type of employee",
# "Hierarchy for North American Industry Classification System (NAICS)",
# val_norm and Date.
#
# STATUS symbols (https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol):
#   ..        not available for a specific reference period
#   x         suppressed to meet the confidentiality requirements of the
#             Statistics Act
#   A / B / C data quality: excellent / very good / good
#   E         use with caution
#   F         too unreliable to be published
#   blank (NA) or A-D: acceptable or better  <- the only rows retained
employ <- statcan_employ %>%
  # Select and rename in one step; the selection order matches the original
  # column order of the raw table subset.
  select(
    time_value = REF_DATE,
    geo_value = GEO,
    ppl_count = VALUE,
    status = STATUS,
    naics_code =
      `Classification Code for North American Industry Classification System (NAICS)`,
    employee_type = `Type of employee`,
    naics_industry = `North American Industry Classification System (NAICS)`
  ) %>%
  mutate(time_value = tsibble::yearmonth(time_value, "%Y-%m")) %>%
  filter(
    # quality flag is acceptable or better (or blank)
    status %in% c("A", "B", "C", "D", NA),
    # a value was actually reported
    !is.na(ppl_count),
    # provinces/territories only, not the national total
    geo_value != "Canada",
    # codes containing "N" denote aggregates
    !str_detect(naics_code, "N"),
    # top-level sectors only, written either as [##] or as [##-##]; see
    # https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html
    str_detect(naics_code, "(\\[[0-9]{2}\\])|(\\[[0-9]{2}-[0-9]{2}\\])"),
    # [00] is the "unclassified" bucket
    naics_code != "[00]"
  ) %>%
  # the helper columns were only needed for filtering
  select(-status, -naics_code) %>%
  # rebuild the factor so it is re-levelled after the rows were dropped
  mutate(naics_industry = factor(naics_industry))
72+
73+
# head(employ)
74+
# statcan_employ_subset <- employ %>%
75+
# tsibble::as_tsibble(
76+
# index=time_value,
77+
# key=c(geo_value, employee_type, naics_industry)) %>%
78+
# as_epi_df(time_type = "yearmonth", as_of="2022-07-28")
79+
80+
# Build the packaged dataset: a tsibble indexed by month and keyed by
# province, employee type and industry, then converted to an epi_df.
#
# `additional_metadata` has to be a *list*. The earlier
# `c(other_keys = c("employee_type", "naics_industry"))` collapsed into a plain
# character vector named `other_keys1` / `other_keys2`, so no `other_keys`
# metadata entry was actually supplied to as_epi_df().
statcan_employ_subset <- employ %>%
  tsibble::as_tsibble(
    index = time_value,
    key = c(geo_value, employee_type, naics_industry)
  ) %>%
  as_epi_df(
    additional_metadata = list(
      other_keys = c("employee_type", "naics_industry")
    )
  )

# Save the object under data/ (as statcan_employ_subset.rda), overwriting any
# previously generated copy.
usethis::use_data(statcan_employ_subset, overwrite = TRUE)
86+
87+
# ================== EDA ==================
88+
# Exploratory checks on the raw table, one column (or group of columns) at a
# time. The trailing comments record what was observed when this was run.

# ---- Overall shape ----
length(statcan_employ[["REF_DATE"]])
names(statcan_employ)

# ---- REF_DATE ----
uniq_ref_date <- statcan_employ %>%
  select("REF_DATE") %>%
  unique()
uniq_ref_date
min(statcan_employ[["REF_DATE"]]) # 2001-01
max(statcan_employ[["REF_DATE"]]) # 2022-05
# Expected number of distinct months: (22-1)*12 + 5
exp_total_dates <- (22 - 1) * 12 + 5
length(unlist(uniq_ref_date)) == exp_total_dates # TRUE
# i.e. every month in the date range has a reference date

# ---- GEO and its identifiers ----
# List of length 14, names of provinces & territories + Canada
statcan_employ %>%
  select("GEO") %>%
  unique()
statcan_employ %>%
  group_by(GEO) %>%
  top_n(n = -1)
# Dissemination Geography Unique Identifier - DGUID.
# https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo055
# 2016 (year)
# A (administrative)
statcan_employ %>%
  select("DGUID") %>%
  unique()

# ---- Units ----
statcan_employ %>%
  select("UOM") %>%
  unique() # Persons
statcan_employ %>%
  select("UOM_ID") %>%
  unique() # 249

# The scalar columns give the power of 10 by which values must be multiplied.
# They are "units" and 0 throughout, so they can be ignored and VALUE can be
# used as-is.
statcan_employ %>%
  select("SCALAR_FACTOR") %>%
  unique() # All "units"
statcan_employ %>%
  select("SCALAR_ID") %>%
  unique() # All 0

# Time series identifier - ignore
statcan_employ %>%
  select("VECTOR") %>%
  unique()

# Related to dimension, which is not present in this table - ignore
statcan_employ %>%
  select("COORDINATE") %>%
  unique()

# ---- VALUE ----
# Data value column. Values in units
statcan_employ %>%
  select("VALUE") %>%
  unique()
# Share of rows with a missing value: approx 3/4 of the rows are NA
statcan_employ %>%
  summarise(
    VALUE_NA = sum(is.na(VALUE)) / length(VALUE),
    VALUE_NOT_NA = sum(!is.na(VALUE)) / length(VALUE),
    TOTAL = length(VALUE)
  )

# ---- STATUS and related flags ----
statcan_employ %>%
  select("STATUS") %>%
  unique()
statcan_employ %>%
  select(STATUS, VALUE) %>%
  group_by(STATUS) %>%
  count()

statcan_employ %>%
  select("SYMBOL") %>%
  unique() # All NA
statcan_employ %>%
  select("TERMINATED") %>%
  unique() # All NA
statcan_employ %>%
  select("DECIMALS") %>%
  unique() # All 0

# ---- Redundant geography columns ----
statcan_employ %>%
  select("GeoUID") %>%
  unique()
statcan_employ %>%
  select("Hierarchy for GEO") %>%
  unique()
statcan_employ %>%
  group_by_at(c("GEO", "DGUID", "GeoUID", "Hierarchy for GEO")) %>%
  count()
# These 4 columns are redundant. Just keep GEO.

# ---- Classification metadata ----
# The next 4 columns are metadata about the last 2 columns;
# ignore them in favour of the descriptive ones.
statcan_employ %>%
  select("Classification Code for Type of employee") %>%
  unique() # All NA
statcan_employ %>%
  select("Hierarchy for Type of employee") %>%
  unique()
statcan_employ %>%
  select(
    "Classification Code for North American Industry Classification System (NAICS)"
  ) %>%
  unique()
statcan_employ %>%
  select(
    "Hierarchy for North American Industry Classification System (NAICS)"
  ) %>%
  unique()

# ---- val_norm ----
# val_norm and VALUE are the same
statcan_employ %>%
  select("val_norm") %>%
  unique()
statcan_employ %>%
  filter(VALUE != val_norm) %>%
  count()

# ---- Date ----
statcan_employ %>%
  select("Date") %>%
  unique()
# Each date has a minimum of 7522 data points
statcan_employ %>%
  group_by(Date) %>%
  count() %>%
  ungroup() %>%
  select(n) %>%
  min()
# Are there any dates that aren't on the 1st of the month?
statcan_employ %>%
  filter(format(as.Date(Date), "%d") != "01") %>%
  nrow() # 0

# ---- Descriptive columns ----
statcan_employ %>%
  select("Type of employee") %>%
  unique() # 3 types
statcan_employ %>%
  select("North American Industry Classification System (NAICS)") %>%
  unique() # lots

# ---- REF_DATE vs Date ----
# REF_DATE looks like YYYY-mm
# Date looks like YYYY-mm-dd
# Check that Date, truncated to the REF_DATE format, always equals REF_DATE
statcan_employ %>%
  select(REF_DATE, Date) %>%
  mutate(Date_trunc = format(as.Date(Date), "%Y-%m")) %>%
  filter(REF_DATE != Date_trunc) # all empty! good
177+
178+
# This is an example plot
179+
# library(ggplot2)
180+
# theme_set(theme_bw())
181+
#
182+
# employ <- statcan_employ_subset %>%
183+
# dplyr::filter(
184+
# geo_value %in% c("British Columbia", "Ontario") &
185+
# naics_industry == "Real estate and rental and leasing [53]") %>%
186+
# dplyr::arrange(geo_value, time_value)
187+
#
188+
# employ %>% ggplot(aes(x = time_value, y = ppl_count, color=employee_type)) +
189+
# geom_line() +
190+
# facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
191+
# scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
192+
# labs(x = "Date", y = "Number employed")

data/statcan_employ_subset.rda

248 KB
Binary file not shown.

man/create_layer.Rd

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/statcan_employ_subset.Rd

Lines changed: 39 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)