UBC-DSCI
diff --git a/‎source/classification1.md
+38-47 b/‎source/classification1.md
+38-47
diff --git a/‎source/classification2.md
+17-25 b/‎source/classification2.md
+17-25
diff --git a/‎source/clustering.md
+8-8 b/‎source/clustering.md
+8-8
diff --git a/‎source/img/faithful_plot.png
-2.54 KB b/‎source/img/faithful_plot.png
-2.54 KB
diff --git a/‎source/img/faithful_plot.svg
+1-1 b/‎source/img/faithful_plot.svg
+1-1
diff --git a/‎source/inference.md
+42-42 b/‎source/inference.md
+42-42
@@ -291,9 +291,9 @@ is colorblind-friendly, so we can stick with that here.
 
 ```{code-cell} ipython3
 perim_concav = alt.Chart(cancer).mark_circle().encode(
-    x=alt.X("Perimeter", title="Perimeter (standardized)"),
-    y=alt.Y("Concavity", title="Concavity (standardized)"),
-    color=alt.Color("Class", title="Diagnosis"),
+    x=alt.X("Perimeter").title("Perimeter (standardized)"),
+    y=alt.Y("Concavity").title("Concavity (standardized)"),
+    color=alt.Color("Class").title("Diagnosis")
 )
 perim_concav
 ```
@@ -371,17 +371,13 @@ depicted by the red, diamond point in {numref}`fig:05-knn-2`.
 :tags: [remove-cell]
 
 perim_concav_with_new_point = (
-    alt.Chart(
-        perim_concav_with_new_point_df,
-    )
+    alt.Chart(perim_concav_with_new_point_df)
     .mark_point(opacity=0.6, filled=True, size=40)
     .encode(
-        x=alt.X("Perimeter", title="Perimeter (standardized)"),
-        y=alt.Y("Concavity", title="Concavity (standardized)"),
-        color=alt.Color("Class", title="Diagnosis"),
-        shape=alt.Shape(
-            "Class", scale=alt.Scale(range=["circle", "circle", "diamond"])
-        ),
+        x=alt.X("Perimeter").title("Perimeter (standardized)"),
+        y=alt.Y("Concavity").title("Concavity (standardized)"),
+        color=alt.Color("Class").title("Diagnosis"),
+        shape=alt.Shape("Class").scale(range=["circle", "circle", "diamond"]),
         size=alt.condition("datum.Class == 'Unknown'", alt.value(100), alt.value(30)),
         stroke=alt.condition("datum.Class == 'Unknown'", alt.value('black'), alt.value(None)),
     )
@@ -1438,9 +1434,9 @@ rare_cancer = pd.concat((
 ))
 
 rare_plot = alt.Chart(rare_cancer).mark_circle().encode(
-    x=alt.X("Perimeter", title="Perimeter (standardized)"),
-    y=alt.Y("Concavity", title="Concavity (standardized)"),
-    color=alt.Color("Class", title="Diagnosis"),
+    x=alt.X("Perimeter").title("Perimeter (standardized)"),
+    y=alt.Y("Concavity").title("Concavity (standardized)"),
+    color=alt.Color("Class").title("Diagnosis")
 )
 rare_plot
 ```
@@ -1822,44 +1818,39 @@ prediction_table["Class"] = knnPredGrid
 
 # plot:
 # 1. the colored scatter of the original data
-unscaled_plot = (
-    alt.Chart(
-        unscaled_cancer,
-    )
-    .mark_point(opacity=0.6, filled=True, size=40)
-    .encode(
-        x=alt.X(
-            "Area",
-            title="Area",
-            scale=alt.Scale(
-                domain=(unscaled_cancer["Area"].min() * 0.95, unscaled_cancer["Area"].max() * 1.05),
-            nice=False
+unscaled_plot = alt.Chart(unscaled_cancer).mark_point(
+    opacity=0.6,
+    filled=True,
+    size=40
+).encode(
+    x=alt.X("Area")
+        .scale(
+            nice=False,
+            domain=(
+                unscaled_cancer["Area"].min() * 0.95,
+                unscaled_cancer["Area"].max() * 1.05
             )
         ),
-        y=alt.Y(
-            "Smoothness",
-            title="Smoothness",
-            scale=alt.Scale(
-                domain=(
-                    unscaled_cancer["Smoothness"].min() * 0.95,
-                    unscaled_cancer["Smoothness"].max() * 1.05,
-                ),
-                nice=False
-            ),
+    y=alt.Y("Smoothness")
+        .scale(
+            nice=False,
+            domain=(
+                unscaled_cancer["Smoothness"].min() * 0.95,
+                unscaled_cancer["Smoothness"].max() * 1.05
+            )
         ),
-        color=alt.Color("Class", title="Diagnosis"),
-    )
+    color=alt.Color("Class").title("Diagnosis")
 )
 
 # 2. the faded colored scatter for the grid points
-prediction_plot = (
-    alt.Chart(prediction_table)
-    .mark_point(opacity=0.05, filled=True, size=300)
-    .encode(
-        x="Area",
-        y="Smoothness",
-        color=alt.Color("Class", title="Diagnosis"),
-    )
+prediction_plot = alt.Chart(prediction_table).mark_point(
+    opacity=0.05,
+    filled=True,
+    size=300
+).encode(
+    x="Area",
+    y="Smoothness",
+    color=alt.Color("Class").title("Diagnosis")
 )
 unscaled_plot + prediction_plot
 ```
 
@@ -331,9 +331,9 @@ cancer['Class'] = cancer['Class'].replace({
 # labeling the points by diagnosis class
 
 perim_concav = alt.Chart(cancer).mark_circle().encode(
-    x=alt.X("Smoothness", scale=alt.Scale(zero=False)),
+    x=alt.X("Smoothness").scale(zero=False),
     y="Concavity",
-    color=alt.Color("Class", title="Diagnosis"),
+    color=alt.Color("Class").title("Diagnosis")
 )
 perim_concav
 ```
@@ -1072,20 +1072,17 @@ accuracies_grid
 
 We can decide which number of neighbors is best by plotting the accuracy versus $K$,
 as shown in {numref}`fig:06-find-k`.
+Here we are using the shortcut `point=True`
+to layer points on top of the line chart.
 
 ```{code-cell} ipython3
 :tags: [remove-output]
 
 accuracy_vs_k = alt.Chart(accuracies_grid).mark_line(point=True).encode(
-    x=alt.X(
-        "n_neighbors",
-        title="Neighbors",
-    ),
-    y=alt.Y(
-        "mean_test_score",
-        title="Accuracy estimate",
-        scale=alt.Scale(domain=(0.85, 0.90)),
-    )
+    x=alt.X("n_neighbors").title("Neighbors"),
+    y=alt.Y("mean_test_score")
+        .scale(domain=(0.85, 0.90))
+        .title("Accuracy estimate")
 )
 
 accuracy_vs_k
@@ -1155,22 +1152,17 @@ large_cancer_tune_grid = GridSearchCV(
 )
 
 large_accuracies_grid = pd.DataFrame(
-                    large_cancer_tune_grid.fit(
-                          cancer_train.loc[:, ["Smoothness", "Concavity"]],
-                          cancer_train["Class"]
-                    ).cv_results_
-                  )
+    large_cancer_tune_grid.fit(
+        cancer_train.loc[:, ["Smoothness", "Concavity"]],
+        cancer_train["Class"]
+    ).cv_results_
+)
 
 large_accuracy_vs_k = alt.Chart(large_accuracies_grid).mark_line(point=True).encode(
-    x=alt.X(
-        "param_kneighborsclassifier__n_neighbors",
-        title="Neighbors",
-    ),
-    y=alt.Y(
-        "mean_test_score",
-        title="Accuracy estimate",
-        scale=alt.Scale(domain=(0.60, 0.90)),
-    )
+    x=alt.X("param_kneighborsclassifier__n_neighbors").title("Neighbors"),
+    y=alt.Y("mean_test_score")
+        .scale(domain=(0.60, 0.90))
+        .title("Accuracy estimate")
 )
 
 large_accuracy_vs_k
 
@@ -195,8 +195,8 @@ to see if we can detect subtypes or groups in our data set.
 import altair as alt
 
 scatter_plot = alt.Chart(penguin_data).mark_circle().encode(
-    x=alt.X("flipper_length_standardized", title="Flipper Length (standardized)"),
-    y=alt.Y("bill_length_standardized", title="Bill Length (standardized)")
+    x=alt.X("flipper_length_standardized").title("Flipper Length (standardized)"),
+    y=alt.Y("bill_length_standardized").title("Bill Length (standardized)")
 )
 ```
 
@@ -219,7 +219,7 @@ Scatter plot of standardized bill length versus standardized flipper length.
 Based on the visualization
 in {numref}`scatter_plot`,
 we might suspect there are a few subtypes of penguins within our data set.
-We can see roughly 3 groups of observations in {numref}`scatter`,
+We can see roughly 3 groups of observations in {numref}`scatter_plot`,
 including:
 
 1. a small flipper and bill length group,
@@ -630,9 +630,9 @@ of the cluster assignments for each point, as shown in {numref}`cluster_plot`.
 
 ```{code-cell} ipython3
 cluster_plot=alt.Chart(clustered_data).mark_circle().encode(
-    x=alt.X("flipper_length_mm", title="Flipper Length (standardized)"),
-    y=alt.Y("bill_length_mm", title="Bill Length (standardized)"),
-    color=alt.Color("cluster:N", title="Cluster"),
+    x=alt.X("flipper_length_mm").title("Flipper Length (standardized)"),
+    y=alt.Y("bill_length_mm").title("Bill Length (standardized)"),
+    color=alt.Color("cluster:N").title("Cluster"),
 )
 ```
 
@@ -718,8 +718,8 @@ Now that we have `inertia` and `k` as columns in a data frame, we can make a lin
 
 ```{code-cell} ipython3
 elbow_plot = alt.Chart(penguin_clust_ks).mark_line().encode(
-    x=alt.X("k", title="K"),
-    y=alt.Y("inertia", title="Total within-cluster sum of squares"),
+    x=alt.X("k").title("K"),
+    y=alt.Y("inertia").title("Total within-cluster sum of squares"),
 )
 ```
 
 
@@ -348,8 +348,10 @@ sampling distribution directly for learning purposes.
 :tags: [remove-output]
 
 sampling_distribution = alt.Chart(sample_estimates).mark_bar().encode(
-    x=alt.X("sample_proportion", title="Sample proportions", bin=alt.Bin(maxbins=20)),
-    y=alt.Y("count()", title="Count"),
+    x=alt.X("sample_proportion")
+        .bin(maxbins=20)
+        .title("Sample proportions"),
+    y=alt.Y("count()").title("Count"),
 )
 
 sampling_distribution
@@ -424,11 +426,9 @@ We can visualize the population distribution of the price per night with a histo
 :tags: [remove-output]
 
 population_distribution = alt.Chart(airbnb).mark_bar().encode(
-    x=alt.X(
-        "price",
-        bin=alt.Bin(maxbins=30),
-        title="Price per night (Canadian dollars)"
-    ),
+    x=alt.X("price")
+        .bin(maxbins=30)
+        .title("Price per night (Canadian dollars)"),
-    y=alt.Y("count()", title="Count"),
+    y=alt.Y("count()").title("Count"),
 )
 
@@ -499,8 +499,10 @@ of our sample.
 :tags: [remove-output]
 
 sample_distribution = alt.Chart(one_sample).mark_bar().encode(
-    x=alt.X("price", bin=alt.Bin(maxbins=30), title="Price per night (Canadian dollars)"),
-    y=alt.Y("count()", title="Count"),
+    x=alt.X("price")
+        .bin(maxbins=30)
+        .title("Price per night (Canadian dollars)"),
+    y=alt.Y("count()").title("Count"),
 )
 
 sample_distribution
@@ -571,12 +573,10 @@ sample_estimates
 :tags: [remove-output]
 
 sampling_distribution = alt.Chart(sample_estimates).mark_bar().encode(
-    x=alt.X(
-        "sample_mean",
-        bin=alt.Bin(maxbins=30),
-        title="Sample mean price per night (Canadian dollars)",
-    ),
-    y=alt.Y("count()", title="Count"),
+    x=alt.X("sample_mean")
+        .bin(maxbins=30)
+        .title("Sample mean price per night (Canadian dollars)"),
+    y=alt.Y("count()").title("Count")
 )
 
 sampling_distribution
@@ -645,15 +645,23 @@ glue(
         population_distribution.mark_bar(clip=True).encode(
-            x=alt.X(
-                "price",
-                bin=alt.Bin(maxbins=30),
-                title="Price per night (Canadian dollars)",
-                scale=alt.Scale(domainMax=700)
-            )
+            x=alt.X("price")
+                .bin(extent=[0, 660], maxbins=40)
+                .title("Price per night (Canadian dollars)")
         ).properties(
             title='Population', height=150
         ),
-        sample_distribution.properties(title="Sample (n = 40)").properties(height=150),
-        sampling_distribution.properties(
+        sample_distribution.encode(
+            x=alt.X("price")
+                .bin(extent=[0, 660], maxbins=40)
+                .title("Price per night (Canadian dollars)")
+        ).properties(title="Sample (n = 40)", height=150),
+        sampling_distribution.encode(
+            x=alt.X("sample_mean")
+                .bin(extent=[0, 660], maxbins=40)
+                .title("Price per night (Canadian dollars)")
+        ).properties(
             title=alt.TitleParams(
                 "Sampling distribution of the mean",
                 subtitle="For 20,000 samples of size 40"
@@ -934,12 +942,10 @@ one_sample
 :tags: []
 
 one_sample_dist = alt.Chart(one_sample).mark_bar().encode(
-    x=alt.X(
-        "price",
-        bin=alt.Bin(maxbins=30),
-        title="Price per night (Canadian dollars)",
-    ),
-    y=alt.Y("count()", title="Count"),
+    x=alt.X("price")
+        .bin(maxbins=30)
+        .title("Price per night (Canadian dollars)"),
+    y=alt.Y("count()").title("Count"),
 )
 
 one_sample_dist
@@ -976,11 +982,9 @@ we change the `replace` parameter to `True`.
 
 boot1 = one_sample.sample(frac=1, replace=True)
 boot1_dist = alt.Chart(boot1).mark_bar().encode(
-    x=alt.X(
-        "price",
-        bin=alt.Bin(maxbins=30),
-        title="Price per night (Canadian dollars)",
-    ),
+    x=alt.X("price")
+        .bin(maxbins=30)
+        .title("Price per night (Canadian dollars)"),
-    y=alt.Y("count()", title="Count"),
+    y=alt.Y("count()").title("Count"),
 )
 
@@ -1031,12 +1035,10 @@ Let's take a look at histograms of the first six replicates of our bootstrap sam
 
 six_bootstrap_samples = boot20000.query("replicate < 6")
 alt.Chart(six_bootstrap_samples, height=150).mark_bar().encode(
-    x=alt.X(
-        "price",
-        bin=alt.Bin(maxbins=20),
-        title="Price per night (Canadian dollars)",
-    ),
-    y=alt.Y("count()", title="Count")
+    x=alt.X("price")
+        .bin(maxbins=20)
+        .title("Price per night (Canadian dollars)"),
+    y=alt.Y("count()").title("Count")
 ).facet(
     "replicate",
     columns=2
@@ -1099,12 +1101,10 @@ boot20000_means
 :tags: []
 
 boot_est_dist = alt.Chart(boot20000_means).mark_bar().encode(
-    x=alt.X(
-        "sample_mean",
-        bin=alt.Bin(maxbins=20),
-        title="Sample mean price per night (Canadian dollars)",
-    ),
-    y=alt.Y("count()", title="Count"),
+    x=alt.X("sample_mean")
+        .bin(maxbins=20)
+        .title("Sample mean price per night (Canadian dollars)"),
+    y=alt.Y("count()").title("Count"),
 )
 
 boot_est_dist