Skip to content

Commit 9f8f718

Browse files
Merge pull request #182 from UBC-DSCI/alt5-method-chaining
alt5 method-based syntax updates to all chapters
2 parents dc6a4a7 + e111acc commit 9f8f718

10 files changed

+268
-321
lines changed

source/classification1.md

+38-47
Original file line numberDiff line numberDiff line change
@@ -291,9 +291,9 @@ is colorblind-friendly, so we can stick with that here.
291291

292292
```{code-cell} ipython3
293293
perim_concav = alt.Chart(cancer).mark_circle().encode(
294-
x=alt.X("Perimeter", title="Perimeter (standardized)"),
295-
y=alt.Y("Concavity", title="Concavity (standardized)"),
296-
color=alt.Color("Class", title="Diagnosis"),
294+
x=alt.X("Perimeter").title("Perimeter (standardized)"),
295+
y=alt.Y("Concavity").title("Concavity (standardized)"),
296+
color=alt.Color("Class").title("Diagnosis")
297297
)
298298
perim_concav
299299
```
@@ -371,17 +371,13 @@ depicted by the red, diamond point in {numref}`fig:05-knn-2`.
371371
:tags: [remove-cell]
372372
373373
perim_concav_with_new_point = (
374-
alt.Chart(
375-
perim_concav_with_new_point_df,
376-
)
374+
alt.Chart(perim_concav_with_new_point_df)
377375
.mark_point(opacity=0.6, filled=True, size=40)
378376
.encode(
379-
x=alt.X("Perimeter", title="Perimeter (standardized)"),
380-
y=alt.Y("Concavity", title="Concavity (standardized)"),
381-
color=alt.Color("Class", title="Diagnosis"),
382-
shape=alt.Shape(
383-
"Class", scale=alt.Scale(range=["circle", "circle", "diamond"])
384-
),
377+
x=alt.X("Perimeter").title("Perimeter (standardized)"),
378+
y=alt.Y("Concavity").title("Concavity (standardized)"),
379+
color=alt.Color("Class").title("Diagnosis"),
380+
shape=alt.Shape("Class").scale(range=["circle", "circle", "diamond"]),
385381
size=alt.condition("datum.Class == 'Unknown'", alt.value(100), alt.value(30)),
386382
stroke=alt.condition("datum.Class == 'Unknown'", alt.value('black'), alt.value(None)),
387383
)
@@ -1438,9 +1434,9 @@ rare_cancer = pd.concat((
14381434
))
14391435
14401436
rare_plot = alt.Chart(rare_cancer).mark_circle().encode(
1441-
x=alt.X("Perimeter", title="Perimeter (standardized)"),
1442-
y=alt.Y("Concavity", title="Concavity (standardized)"),
1443-
color=alt.Color("Class", title="Diagnosis"),
1437+
x=alt.X("Perimeter").title("Perimeter (standardized)"),
1438+
y=alt.Y("Concavity").title("Concavity (standardized)"),
1439+
color=alt.Color("Class").title("Diagnosis")
14441440
)
14451441
rare_plot
14461442
```
@@ -1822,44 +1818,39 @@ prediction_table["Class"] = knnPredGrid
18221818
18231819
# plot:
18241820
# 1. the colored scatter of the original data
1825-
unscaled_plot = (
1826-
alt.Chart(
1827-
unscaled_cancer,
1828-
)
1829-
.mark_point(opacity=0.6, filled=True, size=40)
1830-
.encode(
1831-
x=alt.X(
1832-
"Area",
1833-
title="Area",
1834-
scale=alt.Scale(
1835-
domain=(unscaled_cancer["Area"].min() * 0.95, unscaled_cancer["Area"].max() * 1.05),
1836-
nice=False
1821+
unscaled_plot = alt.Chart(unscaled_cancer).mark_point(
1822+
opacity=0.6,
1823+
filled=True,
1824+
size=40
1825+
).encode(
1826+
x=alt.X("Area")
1827+
.scale(
1828+
nice=False,
1829+
domain=(
1830+
unscaled_cancer["Area"].min() * 0.95,
1831+
unscaled_cancer["Area"].max() * 1.05
18371832
)
18381833
),
1839-
y=alt.Y(
1840-
"Smoothness",
1841-
title="Smoothness",
1842-
scale=alt.Scale(
1843-
domain=(
1844-
unscaled_cancer["Smoothness"].min() * 0.95,
1845-
unscaled_cancer["Smoothness"].max() * 1.05,
1846-
),
1847-
nice=False
1848-
),
1834+
y=alt.Y("Smoothness")
1835+
.scale(
1836+
nice=False,
1837+
domain=(
1838+
unscaled_cancer["Smoothness"].min() * 0.95,
1839+
unscaled_cancer["Smoothness"].max() * 1.05
1840+
)
18491841
),
1850-
color=alt.Color("Class", title="Diagnosis"),
1851-
)
1842+
color=alt.Color("Class").title("Diagnosis")
18521843
)
18531844
18541845
# 2. the faded colored scatter for the grid points
1855-
prediction_plot = (
1856-
alt.Chart(prediction_table)
1857-
.mark_point(opacity=0.05, filled=True, size=300)
1858-
.encode(
1859-
x="Area",
1860-
y="Smoothness",
1861-
color=alt.Color("Class", title="Diagnosis"),
1862-
)
1846+
prediction_plot = alt.Chart(prediction_table).mark_point(
1847+
opacity=0.05,
1848+
filled=True,
1849+
size=300
1850+
).encode(
1851+
x="Area",
1852+
y="Smoothness",
1853+
color=alt.Color("Class").title("Diagnosis")
18631854
)
18641855
unscaled_plot + prediction_plot
18651856
```

source/classification2.md

+17-25
Original file line numberDiff line numberDiff line change
@@ -331,9 +331,9 @@ cancer['Class'] = cancer['Class'].replace({
331331
# labeling the points by diagnosis class
332332
333333
perim_concav = alt.Chart(cancer).mark_circle().encode(
334-
x=alt.X("Smoothness", scale=alt.Scale(zero=False)),
334+
x=alt.X("Smoothness").scale(zero=False),
335335
y="Concavity",
336-
color=alt.Color("Class", title="Diagnosis"),
336+
color=alt.Color("Class").title("Diagnosis")
337337
)
338338
perim_concav
339339
```
@@ -1072,20 +1072,17 @@ accuracies_grid
10721072

10731073
We can decide which number of neighbors is best by plotting the accuracy versus $K$,
10741074
as shown in {numref}`fig:06-find-k`.
1075+
Here we are using the shortcut `point=True`
1076+
to layer a point and line chart.
10751077

10761078
```{code-cell} ipython3
10771079
:tags: [remove-output]
10781080
10791081
accuracy_vs_k = alt.Chart(accuracies_grid).mark_line(point=True).encode(
1080-
x=alt.X(
1081-
"n_neighbors",
1082-
title="Neighbors",
1083-
),
1084-
y=alt.Y(
1085-
"mean_test_score",
1086-
title="Accuracy estimate",
1087-
scale=alt.Scale(domain=(0.85, 0.90)),
1088-
)
1082+
x=alt.X("n_neighbors").title("Neighbors"),
1083+
y=alt.Y("mean_test_score")
1084+
.scale(domain=(0.85, 0.90))
1085+
.title("Accuracy estimate")
10891086
)
10901087
10911088
accuracy_vs_k
@@ -1155,22 +1152,17 @@ large_cancer_tune_grid = GridSearchCV(
11551152
)
11561153
11571154
large_accuracies_grid = pd.DataFrame(
1158-
large_cancer_tune_grid.fit(
1159-
cancer_train.loc[:, ["Smoothness", "Concavity"]],
1160-
cancer_train["Class"]
1161-
).cv_results_
1162-
)
1155+
large_cancer_tune_grid.fit(
1156+
cancer_train.loc[:, ["Smoothness", "Concavity"]],
1157+
cancer_train["Class"]
1158+
).cv_results_
1159+
)
11631160
11641161
large_accuracy_vs_k = alt.Chart(large_accuracies_grid).mark_line(point=True).encode(
1165-
x=alt.X(
1166-
"param_kneighborsclassifier__n_neighbors",
1167-
title="Neighbors",
1168-
),
1169-
y=alt.Y(
1170-
"mean_test_score",
1171-
title="Accuracy estimate",
1172-
scale=alt.Scale(domain=(0.60, 0.90)),
1173-
)
1162+
x=alt.X("param_kneighborsclassifier__n_neighbors").title("Neighbors"),
1163+
y=alt.Y("mean_test_score")
1164+
.scale(domain=(0.60, 0.90))
1165+
.title("Accuracy estimate")
11741166
)
11751167
11761168
large_accuracy_vs_k

source/clustering.md

+8-8
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,8 @@ to see if we can detect subtypes or groups in our data set.
195195
import altair as alt
196196
197197
scatter_plot = alt.Chart(penguin_data).mark_circle().encode(
198-
x=alt.X("flipper_length_standardized", title="Flipper Length (standardized)"),
199-
y=alt.Y("bill_length_standardized", title="Bill Length (standardized)")
198+
x=alt.X("flipper_length_standardized").title("Flipper Length (standardized)"),
199+
y=alt.Y("bill_length_standardized").title("Bill Length (standardized)")
200200
)
201201
```
202202

@@ -219,7 +219,7 @@ Scatter plot of standardized bill length versus standardized flipper length.
219219
Based on the visualization
220220
in {numref}`scatter_plot`,
221221
we might suspect there are a few subtypes of penguins within our data set.
222-
We can see roughly 3 groups of observations in {numref}`scatter`,
222+
We can see roughly 3 groups of observations in {numref}`scatter_plot`,
223223
including:
224224

225225
1. a small flipper and bill length group,
@@ -630,9 +630,9 @@ of the cluster assignments for each point, as shown in {numref}`cluster_plot`.
630630

631631
```{code-cell} ipython3
632632
cluster_plot=alt.Chart(clustered_data).mark_circle().encode(
633-
x=alt.X("flipper_length_mm", title="Flipper Length (standardized)"),
634-
y=alt.Y("bill_length_mm", title="Bill Length (standardized)"),
635-
color=alt.Color("cluster:N", title="Cluster"),
633+
x=alt.X("flipper_length_mm").title("Flipper Length (standardized)"),
634+
y=alt.Y("bill_length_mm").title("Bill Length (standardized)"),
635+
color=alt.Color("cluster:N").title("Cluster"),
636636
)
637637
```
638638

@@ -718,8 +718,8 @@ Now that we have `inertia` and `k` as columns in a data frame, we can make a lin
718718

719719
```{code-cell} ipython3
720720
elbow_plot = alt.Chart(penguin_clust_ks).mark_line().encode(
721-
x=alt.X("k", title="K"),
722-
y=alt.Y("inertia", title="Total within-cluster sum of squares"),
721+
x=alt.X("k").title("K"),
722+
y=alt.Y("inertia").title("Total within-cluster sum of squares"),
723723
)
724724
```
725725

source/img/faithful_plot.png

-2.54 KB
Loading

source/img/faithful_plot.svg

+1-1
Loading

source/inference.md

+42-42
Original file line numberDiff line numberDiff line change
@@ -348,8 +348,10 @@ sampling distribution directly for learning purposes.
348348
:tags: [remove-output]
349349
350350
sampling_distribution = alt.Chart(sample_estimates).mark_bar().encode(
351-
x=alt.X("sample_proportion", title="Sample proportions", bin=alt.Bin(maxbins=20)),
352-
y=alt.Y("count()", title="Count"),
351+
x=alt.X("sample_proportion")
352+
.bin(maxbins=20)
353+
.title("Sample proportions"),
354+
y=alt.Y("count()").title("Count"),
353355
)
354356
355357
sampling_distribution
@@ -424,11 +426,9 @@ We can visualize the population distribution of the price per night with a histo
424426
:tags: [remove-output]
425427
426428
population_distribution = alt.Chart(airbnb).mark_bar().encode(
427-
x=alt.X(
428-
"price",
429-
bin=alt.Bin(maxbins=30),
430-
title="Price per night (Canadian dollars)"
431-
),
429+
x=alt.X("price")
430+
.bin(maxbins=30)
431+
.title("Price per night (Canadian dollars)"),
432432
y=alt.Y("count()", title="Count"),
433433
)
434434
@@ -499,8 +499,10 @@ of our sample.
499499
:tags: [remove-output]
500500
501501
sample_distribution = alt.Chart(one_sample).mark_bar().encode(
502-
x=alt.X("price", bin=alt.Bin(maxbins=30), title="Price per night (Canadian dollars)"),
503-
y=alt.Y("count()", title="Count"),
502+
x=alt.X("price")
503+
.bin(maxbins=30)
504+
.title("Price per night (Canadian dollars)"),
505+
y=alt.Y("count()").title("Count"),
504506
)
505507
506508
sample_distribution
@@ -571,12 +573,10 @@ sample_estimates
571573
:tags: [remove-output]
572574
573575
sampling_distribution = alt.Chart(sample_estimates).mark_bar().encode(
574-
x=alt.X(
575-
"sample_mean",
576-
bin=alt.Bin(maxbins=30),
577-
title="Sample mean price per night (Canadian dollars)",
578-
),
579-
y=alt.Y("count()", title="Count"),
576+
x=alt.X("sample_mean")
577+
.bin(maxbins=30)
578+
.title("Sample mean price per night (Canadian dollars)"),
579+
y=alt.Y("count()").title("Count")
580580
)
581581
582582
sampling_distribution
@@ -645,15 +645,23 @@ glue(
645645
population_distribution.mark_bar(clip=True).encode(
646646
x=alt.X(
647647
"price",
648-
bin=alt.Bin(maxbins=30),
648+
bin=alt.Bin(extent=[0, 660], maxbins=40),
649649
title="Price per night (Canadian dollars)",
650-
scale=alt.Scale(domainMax=700)
650+
#scale=alt.Scale(domainMax=700)
651651
)
652652
).properties(
653653
title='Population', height=150
654654
),
655-
sample_distribution.properties(title="Sample (n = 40)").properties(height=150),
656-
sampling_distribution.properties(
655+
sample_distribution.encode(
656+
x=alt.X("price")
657+
.bin(extent=[0, 660], maxbins=40)
658+
.title("Price per night (Canadian dollars)")
659+
).properties(title="Sample (n = 40)").properties(height=150),
660+
sampling_distribution.encode(
661+
x=alt.X("sample_mean")
662+
.bin(extent=[0, 660], maxbins=40)
663+
.title("Price per night (Canadian dollars)")
664+
).properties(
657665
title=alt.TitleParams(
658666
"Sampling distribution of the mean",
659667
subtitle="For 20,000 samples of size 40"
@@ -934,12 +942,10 @@ one_sample
934942
:tags: []
935943
936944
one_sample_dist = alt.Chart(one_sample).mark_bar().encode(
937-
x=alt.X(
938-
"price",
939-
bin=alt.Bin(maxbins=30),
940-
title="Price per night (Canadian dollars)",
941-
),
942-
y=alt.Y("count()", title="Count"),
945+
x=alt.X("price")
946+
.bin(maxbins=30)
947+
.title("Price per night (Canadian dollars)"),
948+
y=alt.Y("count()").title("Count"),
943949
)
944950
945951
one_sample_dist
@@ -976,11 +982,9 @@ we change the `replace` parameter to `True`.
976982
977983
boot1 = one_sample.sample(frac=1, replace=True)
978984
boot1_dist = alt.Chart(boot1).mark_bar().encode(
979-
x=alt.X(
980-
"price",
981-
bin=alt.Bin(maxbins=30),
982-
title="Price per night (Canadian dollars)",
983-
),
985+
x=alt.X("price")
986+
.bin(maxbins=30)
987+
.title("Price per night (Canadian dollars)"),
984988
y=alt.Y("count()", title="Count"),
985989
)
986990
@@ -1031,12 +1035,10 @@ Let's take a look at histograms of the first six replicates of our bootstrap sam
10311035
10321036
six_bootstrap_samples = boot20000.query("replicate < 6")
10331037
alt.Chart(six_bootstrap_samples, height=150).mark_bar().encode(
1034-
x=alt.X(
1035-
"price",
1036-
bin=alt.Bin(maxbins=20),
1037-
title="Price per night (Canadian dollars)",
1038-
),
1039-
y=alt.Y("count()", title="Count")
1038+
x=alt.X("price")
1039+
.bin(maxbins=20)
1040+
.title("Price per night (Canadian dollars)"),
1041+
y=alt.Y("count()").title("Count")
10401042
).facet(
10411043
"replicate",
10421044
columns=2
@@ -1099,12 +1101,10 @@ boot20000_means
10991101
:tags: []
11001102
11011103
boot_est_dist = alt.Chart(boot20000_means).mark_bar().encode(
1102-
x=alt.X(
1103-
"sample_mean",
1104-
bin=alt.Bin(maxbins=20),
1105-
title="Sample mean price per night (Canadian dollars)",
1106-
),
1107-
y=alt.Y("count()", title="Count"),
1104+
x=alt.X("sample_mean")
1105+
.bin(maxbins=20)
1106+
.title("Sample mean price per night (Canadian dollars)"),
1107+
y=alt.Y("count()").title("Count"),
11081108
)
11091109
11101110
boot_est_dist

0 commit comments

Comments
 (0)