|
| 1 | +library(epipredict) |
| 2 | +library(epiprocess) |
| 3 | +library(cansim) |
| 4 | +library(dplyr) |
| 5 | +library(stringr) |
| 6 | + |
# Run this once: get_cansim() retrieves the whole Statistics Canada table.
# Source page for table 14-10-0201-01 (the pid is the table number without
# dashes):
# https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410020101#data
statcan_employ <- get_cansim("14-10-0201-01")
| 10 | + |
# ================== Subset & Filtering ==================
# Build `employ`: keep seven of the raw columns under short names, parse the
# reference month, keep only acceptable-quality, non-national rows for
# top-level NAICS sectors, then drop the columns that were only needed for
# filtering.
employ <- statcan_employ %>%
  # select() renames while selecting (new_name = old_name). Every other raw
  # column is dropped; the order below is the resulting column order.
  select(
    time_value = REF_DATE,
    geo_value = GEO,
    ppl_count = VALUE,
    status = STATUS,
    naics_code =
      `Classification Code for North American Industry Classification System (NAICS)`,
    employee_type = `Type of employee`,
    naics_industry = `North American Industry Classification System (NAICS)`
  ) %>%
  # REF_DATE looks like "YYYY-mm"; convert it to a tsibble yearmonth.
  mutate(time_value = tsibble::yearmonth(time_value, "%Y-%m")) %>%
  # Each argument to filter() is one condition; a row must pass all of them.
  filter(
    # Data quality flag, see
    # https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol
    #   ..      not available for a specific reference period
    #   x       suppressed (confidentiality requirements of the Statistics Act)
    #   A, B, C excellent, very good, good
    #   E       use with caution
    #   F       too unreliable to be published
    # Only blank (NA) or A-D, i.e. acceptable or better, is kept.
    is.na(status) | status %in% c("A", "B", "C", "D"),
    # a row without a count is of no use
    !is.na(ppl_count),
    # drop the national total
    geo_value != "Canada",
    # codes containing "N" correspond to aggregates
    !str_detect(naics_code, "N"),
    # top-level sectors only: the code is either [##] or [##-##], see
    # https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html
    str_detect(naics_code, "(\\[[0-9]{2}\\])|(\\[[0-9]{2}-[0-9]{2}\\])"),
    # [00] corresponds to unclassified
    naics_code != "[00]"
  ) %>%
  # status and naics_code were only needed for the filter above
  select(-status, -naics_code) %>%
  # rebuild the factor so that levels removed by the filter disappear
  mutate(naics_industry = factor(naics_industry))
| 72 | + |
| 73 | +# head(employ) |
| 74 | +# statcan_employ_subset <- employ %>% |
| 75 | +# tsibble::as_tsibble( |
| 76 | +# index=time_value, |
| 77 | +# key=c(geo_value, employee_type, naics_industry)) %>% |
| 78 | +# as_epi_df(time_type = "yearmonth", as_of="2022-07-28") |
| 79 | + |
# Convert `employ` to an epi_df, going through a tsibble indexed by
# `time_value` and keyed by geography, employee type and industry.
#
# `additional_metadata` has to be a list. Wrapping the keys in c() instead,
# as in c(other_keys = c("a", "b")), flattens them into a character vector
# named other_keys1 / other_keys2, so no `other_keys` entry would exist.
statcan_employ_subset <- employ %>%
  tsibble::as_tsibble(
    index = time_value,
    key = c(geo_value, employee_type, naics_industry)
  ) %>%
  as_epi_df(
    additional_metadata = list(
      other_keys = c("employee_type", "naics_industry")
    )
  )

# Store the result as package data, replacing any existing copy.
usethis::use_data(statcan_employ_subset, overwrite = TRUE)
| 86 | + |
# ================== EDA ==================
# Size of the raw table and its column names.
nrow(statcan_employ)
colnames(statcan_employ)

# One row per distinct reference month.
uniq_ref_date <- distinct(statcan_employ, REF_DATE)
uniq_ref_date

# Observed range when this was written: 2001-01 to 2022-05.
statcan_employ %>% pull(REF_DATE) %>% min()
statcan_employ %>% pull(REF_DATE) %>% max()

# 21 full years (2001 through 2021) plus the first 5 months of 2022.
exp_total_dates <- (22 - 1) * 12 + 5
# TRUE when observed, meaning every month in the range has a reference date.
nrow(uniq_ref_date) == exp_total_dates
| 99 | + |
# Geography: 14 distinct values were observed, the provinces and territories
# plus "Canada".
distinct(statcan_employ, GEO)
# Per GEO, the bottom-ranked row(s). No `wt` is given, so top_n() picks the
# ordering column itself and reports which one it used.
top_n(group_by(statcan_employ, GEO), n = -1)

# DGUID is the Dissemination Geography Unique Identifier, see
# https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo055
# The values seen here start with 2016 (year) followed by A (administrative).
distinct(statcan_employ, DGUID)

# Unit of measure: "Persons", with id 249.
distinct(statcan_employ, UOM)
distinct(statcan_employ, UOM_ID)

# The scalar columns give the power of ten by which VALUE must be multiplied.
# They are all "units" / 0, so VALUE can be used as-is and these columns can
# be ignored.
distinct(statcan_employ, SCALAR_FACTOR)
distinct(statcan_employ, SCALAR_ID)

# VECTOR is a time series identifier - ignore.
distinct(statcan_employ, VECTOR)

# COORDINATE relates to a dimension that is not present in this table - ignore.
distinct(statcan_employ, COORDINATE)
| 121 | + |
# VALUE holds the data, expressed in units.
distinct(statcan_employ, VALUE)

# Share of missing and present values, plus the total number of rows.
# When observed, roughly 3/4 of the rows had an NA value.
statcan_employ %>%
  summarise(
    VALUE_NA = sum(is.na(VALUE)) / n(),
    VALUE_NOT_NA = sum(!is.na(VALUE)) / n(),
    TOTAL = n()
  )

# Which quality flags occur, and how many rows carry each of them.
distinct(statcan_employ, STATUS)
statcan_employ %>%
  group_by(STATUS) %>%
  count()
| 138 | + |
# Columns that carried no information when inspected.
distinct(statcan_employ, SYMBOL) # all NA
distinct(statcan_employ, TERMINATED) # all NA
distinct(statcan_employ, DECIMALS) # all 0

# GEO, DGUID, GeoUID and "Hierarchy for GEO" describe the same thing: counting
# their combinations shows they are redundant, so only GEO is kept.
distinct(statcan_employ, GeoUID)
distinct(statcan_employ, `Hierarchy for GEO`)
statcan_employ %>%
  group_by(GEO, DGUID, GeoUID, `Hierarchy for GEO`) %>%
  count()

# The following four columns are metadata about "Type of employee" and the
# NAICS column; the descriptive columns are preferred over them.
distinct(statcan_employ, `Classification Code for Type of employee`) # all NA
distinct(statcan_employ, `Hierarchy for Type of employee`)
distinct(
  statcan_employ,
  `Classification Code for North American Industry Classification System (NAICS)`
)
distinct(
  statcan_employ,
  `Hierarchy for North American Industry Classification System (NAICS)`
)

# val_norm duplicates VALUE: count the rows where the two differ.
distinct(statcan_employ, val_norm)
count(filter(statcan_employ, VALUE != val_norm))
| 160 | + |
distinct(statcan_employ, Date)
# Smallest number of rows for any single date (7522 when observed).
statcan_employ %>%
  count(Date) %>%
  pull(n) %>%
  min()
# Number of rows whose Date is not the 1st of the month (0 when observed).
nrow(filter(statcan_employ, format(as.Date(Date), "%d") != "01"))

distinct(statcan_employ, `Type of employee`) # 3 types
# many distinct industries
distinct(
  statcan_employ,
  `North American Industry Classification System (NAICS)`
)

# REF_DATE looks like YYYY-mm and Date looks like YYYY-mm-dd. Truncate Date to
# the REF_DATE format and list the rows where the two disagree; an empty
# result (as observed) means they always match.
statcan_employ %>%
  transmute(
    REF_DATE,
    Date,
    Date_trunc = format(as.Date(Date), "%Y-%m")
  ) %>%
  filter(REF_DATE != Date_trunc)
| 177 | + |
| 178 | +# This is an example plot |
| 179 | +# library(ggplot2) |
| 180 | +# theme_set(theme_bw()) |
| 181 | +# |
| 182 | +# employ <- statcan_employ_subset %>% |
| 183 | +# dplyr::filter( |
| 184 | +# geo_value %in% c("British Columbia", "Ontario") & |
| 185 | +# naics_industry == "Real estate and rental and leasing [53]") %>% |
| 186 | +# dplyr::arrange(geo_value, time_value) |
| 187 | +# |
| 188 | +# employ %>% ggplot(aes(x = time_value, y = ppl_count, color=employee_type)) + |
| 189 | +# geom_line() + |
| 190 | +# facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) + |
| 191 | +# scale_x_date(minor_breaks = "month", date_labels = "%b %y") + |
| 192 | +# labs(x = "Date", y = "Number employed") |
0 commit comments