Skip to content

Commit 2fb48b7

Browse files
authored
feat: deprecate academies_master_list.csv file (#1386)
* feat: deprecate academies_master_list.csv file
  - remove dependency on this file from `build_academy_data()`
  - add fields to the AAR input fields to cover those that would otherwise be missing
  - update phase-mapping to accommodate potentially fewer sources
* fix: amend phase-type mappings
  - updated to derive from GIAS values only
* fix: remove gias-all-links from academies
  - no longer read `gias_all_links.csv` during academies processing
  - Trust data sourced from `aar_cs.csv`
  - amended join to CFO data to be based on Co. No. rather than URN
  - `Group UID` no longer written to DB (unused and `DEFAULT NULL`)
1 parent fc10c9a commit 2fb48b7

File tree

11 files changed

+158
-230
lines changed

11 files changed

+158
-230
lines changed

data-pipeline/src/pipeline/database.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -141,15 +141,15 @@ def insert_schools_and_trusts_and_local_authorities(
141141
projections = {
142142
"URN": "URN",
143143
"EstablishmentName": "SchoolName",
144-
"Companies House Number": "TrustCompanyNumber",
145-
"Group Name": "TrustName",
144+
"Company Registration Number": "TrustCompanyNumber",
145+
"Company_Name": "TrustName",
146146
"Federation Lead School URN": "FederationLeadURN",
147147
"Federation Name": "FederationLeadName",
148148
"LA Code": "LACode",
149149
"LA Name": "LAName",
150150
"London Weighting": "LondonWeighting",
151151
"Finance Type": "FinanceType",
152-
"Overall Phase": "OverallPhase",
152+
"SchoolPhaseType": "OverallPhase",
153153
"TypeOfEstablishment (name)": "SchoolType",
154154
"Has Sixth Form": "HasSixthForm",
155155
"Has Nursery": "HasNursery",
@@ -176,19 +176,18 @@ def insert_schools_and_trusts_and_local_authorities(
176176
logger.info(f"Wrote {len(write_frame)} rows to school {run_type} - {year}")
177177

178178
trust_projections = {
179-
"Group Name": "TrustName",
180-
"Group UID": "UID",
179+
"Company_Name": "TrustName",
181180
"CFO name": "CFOName",
182181
"CFO email": "CFOEmail",
183182
"OpenDate": "OpenDate",
184-
"Companies House Number": "CompanyNumber",
183+
"Company Registration Number": "CompanyNumber",
185184
}
186185

187186
trusts = (
188-
df[~df["Companies House Number"].isna()]
187+
df[~df["Company Registration Number"].isna()]
189188
.reset_index()
190-
.sort_values(by=["Companies House Number", "OpenDate"], ascending=False)
191-
.groupby(["Companies House Number"])
189+
.sort_values(by=["Company Registration Number", "OpenDate"], ascending=False)
190+
.groupby(["Company Registration Number"])
192191
.first()
193192
.reset_index()
194193
.rename(columns=trust_projections)[[*trust_projections.values()]]

data-pipeline/src/pipeline/input_schemas.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -57,27 +57,6 @@
5757
"LinkEstablishedDate": "string",
5858
}
5959

60-
academy_master_list_index_col = "LA Establishment Number"
61-
academy_master_list = {
62-
"Company Registration Number": "string",
63-
"Incorporation Date": "string",
64-
"Academy Trust UPIN": "Int64",
65-
"Academy Trust Name": "string",
66-
"Academy Name": "string",
67-
"Academy UPIN": "Int64",
68-
"Trust Type": "string",
69-
"LA Establishment Number": "string",
70-
"Date Opened": "string",
71-
"Type of Provision - Phase": "string",
72-
"Regional School Commissioner": "string",
73-
"Valid From": "string",
74-
"Valid to": "string",
75-
"Territory": "string",
76-
"Academy Status": "string",
77-
"Academy Trust Status": "string",
78-
"Number of Academies in Trust": "Int64",
79-
}
80-
8160
maintained_schools_master_list_index_col = "URN"
8261
maintained_schools_master_list = {
8362
"URN": "Int64",
@@ -321,11 +300,14 @@
321300
"BNCH21606 (Agency supply teaching staff)": "float",
322301
"BNCH21403 (Energy)": "float",
323302
"BNCH21402 (Water and sewerage)": "float",
303+
"Valid To": "string",
324304
}
325305

326306
aar_central_services_index_col = "Lead_UPIN"
327307
aar_central_services = {
328308
"Lead_UPIN": "Int64",
309+
"Company_Number": "string",
310+
"Company_Name": "string",
329311
"BNCH11110T (EFA Revenue Grants)": "float",
330312
"BNCH11131 (DfE Family Revenue Grants)": "float",
331313
"BNCH11141 (SEN)": "float",

data-pipeline/src/pipeline/main.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ def pre_process_academy_ar(run_type, year) -> tuple[pd.DataFrame, pd.DataFrame]:
122122
academy_ar_data = get_blob(
123123
raw_container, f"{run_type}/{year}/aar.csv", encoding="utf-8"
124124
)
125+
125126
aar = prepare_aar_data(academy_ar_data, year)
126127

127128
write_blob(
@@ -187,17 +188,7 @@ def pre_process_academies_data(run_type, year, data_ref) -> pd.DataFrame:
187188
logger.info("Building Academy Set")
188189
schools, census, sen, cdc, aar, ks2, ks4, cfo, central_services = data_ref
189190

190-
academies_data = get_blob(
191-
raw_container, f"{run_type}/{year}/academy_master_list.csv", encoding="utf-8"
192-
)
193-
194-
links_data = get_blob(
195-
raw_container, f"{run_type}/{year}/gias_all_links.csv", encoding="cp1252"
196-
)
197-
198191
academies = build_academy_data(
199-
academies_data,
200-
links_data,
201192
year,
202193
schools,
203194
census,
@@ -209,6 +200,7 @@ def pre_process_academies_data(run_type, year, data_ref) -> pd.DataFrame:
209200
cfo,
210201
central_services,
211202
)
203+
212204
write_blob(
213205
"pre-processed",
214206
f"{run_type}/{year}/academies.parquet",

data-pipeline/src/pipeline/maintained_schools.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,8 @@ def map_school_type_attrs(maintained_schools: pd.DataFrame) -> pd.DataFrame:
107107
maintained_schools["Finance Type"] = "Maintained"
108108
maintained_schools["SchoolPhaseType"] = maintained_schools.apply(
109109
lambda df: mappings.map_phase_type(
110-
df["TypeOfEstablishment (code)"], df["PhaseOfEducation (code)"], df["Overall Phase"]
110+
establishment_code=df["TypeOfEstablishment (code)"],
111+
phase_code=df["PhaseOfEducation (code)"],
111112
),
112113
axis=1,
113114
)
@@ -169,6 +170,7 @@ def calc_rag_cost_series(
169170

170171
return maintained_schools
171172

173+
172174
# net catering cost, not net catering income
173175
def calc_catering_net_costs(maintained_schools: pd.DataFrame) -> pd.DataFrame:
174176
maintained_schools["Catering staff and supplies_Net Costs"] = (

data-pipeline/src/pipeline/mappings.py

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -14,46 +14,47 @@ def map_ofsted_rating(rating: str):
1414
return rating
1515

1616

17-
def map_phase_type(establishment_code: int, phase_code: int, provision: str):
17+
def _map_secondary_phases(establishment_code: int) -> str:
18+
match establishment_code:
19+
case 40:
20+
return "University Technical College"
21+
case _:
22+
return "Secondary"
23+
1824

19-
if (pd.isna(establishment_code) or pd.isna(phase_code) or pd.isna(provision)):
20-
return
21-
25+
def _map_not_applicable_phases(establishment_code: int) -> str:
2226
match establishment_code:
23-
case 33 | 36 | 44:
27+
case 6:
28+
return "University Technical College"
29+
case 7 | 12 | 33 | 36 | 44:
2430
return "Special"
31+
case 14:
32+
return "Pupil Referral Unit"
2533
case 38 | 42 | 43:
2634
return "Alternative Provision"
27-
case 40:
28-
return "University Technical College"
29-
case 39 | 45 | 46:
30-
return "Post-16"
35+
case _:
36+
return "Unknown"
3137

38+
39+
def map_phase_type(
40+
establishment_code: int,
41+
phase_code: int,
42+
) -> str:
3243
match phase_code:
33-
case 7:
34-
return "All-through"
44+
case 0:
45+
return _map_not_applicable_phases(establishment_code)
46+
case 1:
47+
return "Nursery"
3548
case 2 | 3:
3649
return "Primary"
3750
case 4 | 5:
38-
return "Secondary"
39-
40-
match provision.lower():
41-
case "16 plus" | "post-16":
51+
return _map_secondary_phases(establishment_code)
52+
case 6:
4253
return "Post-16"
43-
case "secondary":
44-
return "Secondary"
45-
case "special":
46-
return "Special"
47-
case "primary":
48-
return "Primary"
49-
case "all through" | "all-through":
54+
case 7:
5055
return "All-through"
51-
case "nursery":
52-
return "Nursery"
53-
case "pupil referral unit":
54-
return "Pupil Referral Unit"
5556
case _:
56-
return "Other"
57+
return "Unknown"
5758

5859

5960
def map_block_age(block_age: str):
@@ -236,8 +237,8 @@ def map_cost_series(category_name, df, basis):
236237

237238
for sub_category in sub_categories:
238239
df[sub_category + "_Per Unit"] = df[sub_category].fillna(0) / basis
239-
df[sub_category + "_Per Unit"].replace(
240-
[np.inf, -np.inf, np.nan], 0, inplace=True
240+
df[sub_category + "_Per Unit"] = df[sub_category + "_Per Unit"].replace(
241+
[np.inf, -np.inf, np.nan], 0
241242
)
242243

243244
return df

0 commit comments

Comments
 (0)