UBC-DSCI
diff --git a/‎source/classification1.md
Lines changed: 29 additions & 31 deletions b/‎source/classification1.md
Lines changed: 29 additions & 31 deletions
diff --git a/‎source/classification2.md
Lines changed: 44 additions & 39 deletions b/‎source/classification2.md
Lines changed: 44 additions & 39 deletions
diff --git a/‎source/img/faithful_plot.png
9.66 KB b/‎source/img/faithful_plot.png
9.66 KB
diff --git a/‎source/img/faithful_plot.svg
Lines changed: 1 addition & 1 deletion b/‎source/img/faithful_plot.svg
Lines changed: 1 addition & 1 deletion
diff --git a/‎source/intro.md
Lines changed: 26 additions & 38 deletions b/‎source/intro.md
Lines changed: 26 additions & 38 deletions
@@ -290,14 +290,10 @@ perimeter and concavity variables. Recall that `altair's` default palette
 is colorblind-friendly, so we can stick with that here.
 
 ```{code-cell} ipython3
-perim_concav = (
-    alt.Chart(cancer)
-    .mark_circle()
-    .encode(
-        x=alt.X("Perimeter", title="Perimeter (standardized)"),
-        y=alt.Y("Concavity", title="Concavity (standardized)"),
-        color=alt.Color("Class", title="Diagnosis"),
-    )
+perim_concav = alt.Chart(cancer).mark_circle().encode(
+    x=alt.X("Perimeter", title="Perimeter (standardized)"),
+    y=alt.Y("Concavity", title="Concavity (standardized)"),
+    color=alt.Color("Class", title="Diagnosis"),
 )
 perim_concav
 ```
@@ -1441,14 +1437,10 @@ rare_cancer = pd.concat((
     cancer[cancer["Class"] == 'Malignant'].head(3)
 ))
 
-rare_plot = (
-    alt.Chart(rare_cancer)
-    .mark_circle()
-    .encode(
-        x=alt.X("Perimeter", title="Perimeter (standardized)"),
-        y=alt.Y("Concavity", title="Concavity (standardized)"),
-        color=alt.Color("Class", title="Diagnosis"),
-    )
+rare_plot = alt.Chart(rare_cancer).mark_circle().encode(
+    x=alt.X("Perimeter", title="Perimeter (standardized)"),
+    y=alt.Y("Concavity", title="Concavity (standardized)"),
+    color=alt.Color("Class", title="Diagnosis"),
 )
 rare_plot
 ```
@@ -1555,10 +1547,10 @@ knn.fit(X=rare_cancer.loc[:, ["Perimeter", "Concavity"]], y=rare_cancer["Class"]
 
 # create a prediction pt grid
 per_grid = np.linspace(
-    rare_cancer["Perimeter"].min(), rare_cancer["Perimeter"].max(), 50
+    rare_cancer["Perimeter"].min() * 1.05, rare_cancer["Perimeter"].max() * 1.05, 50
 )
 con_grid = np.linspace(
-    rare_cancer["Concavity"].min(), rare_cancer["Concavity"].max(), 50
+    rare_cancer["Concavity"].min() * 1.05, rare_cancer["Concavity"].max() * 1.05, 50
 )
 pcgrid = np.array(np.meshgrid(per_grid, con_grid)).reshape(2, -1).T
 pcgrid = pd.DataFrame(pcgrid, columns=["Perimeter", "Concavity"])
@@ -1594,14 +1586,16 @@ prediction_plot = (
             "Perimeter",
             title="Perimeter (standardized)",
             scale=alt.Scale(
-                domain=(rare_cancer["Perimeter"].min(), rare_cancer["Perimeter"].max())
+                domain=(rare_cancer["Perimeter"].min() * 1.05, rare_cancer["Perimeter"].max() * 1.05),
+                nice=False
             ),
         ),
         y=alt.Y(
             "Concavity",
             title="Concavity (standardized)",
             scale=alt.Scale(
-                domain=(rare_cancer["Concavity"].min(), rare_cancer["Concavity"].max())
+                domain=(rare_cancer["Concavity"].min() * 1.05, rare_cancer["Concavity"].max() * 1.05),
+                nice=False
             ),
         ),
         color=alt.Color("Class", title="Diagnosis"),
@@ -1685,14 +1679,16 @@ rare_plot = (
             "Perimeter",
             title="Perimeter (standardized)",
             scale=alt.Scale(
-                domain=(rare_cancer["Perimeter"].min(), rare_cancer["Perimeter"].max())
+                domain=(rare_cancer["Perimeter"].min() * 1.05, rare_cancer["Perimeter"].max() * 1.05),
+                nice=False
             ),
         ),
         y=alt.Y(
             "Concavity",
             title="Concavity (standardized)",
             scale=alt.Scale(
-                domain=(rare_cancer["Concavity"].min(), rare_cancer["Concavity"].max())
+                domain=(rare_cancer["Concavity"].min() * 1.05, rare_cancer["Concavity"].max() * 1.05),
+                nice=False
             ),
         ),
         color=alt.Color("Class", title="Diagnosis"),
@@ -1809,10 +1805,10 @@ import numpy as np
 
 # create the grid of area/smoothness vals, and arrange in a data frame
 are_grid = np.linspace(
-    unscaled_cancer["Area"].min(), unscaled_cancer["Area"].max(), 50
+    unscaled_cancer["Area"].min() * 0.95, unscaled_cancer["Area"].max() * 1.05, 50
 )
 smo_grid = np.linspace(
-    unscaled_cancer["Smoothness"].min(), unscaled_cancer["Smoothness"].max(), 50
+    unscaled_cancer["Smoothness"].min() * 0.95, unscaled_cancer["Smoothness"].max() * 1.05, 50
 )
 asgrid = np.array(np.meshgrid(are_grid, smo_grid)).reshape(2, -1).T
 asgrid = pd.DataFrame(asgrid, columns=["Area", "Smoothness"])
@@ -1836,17 +1832,19 @@ unscaled_plot = (
             "Area",
             title="Area",
             scale=alt.Scale(
-                domain=(unscaled_cancer["Area"].min(), unscaled_cancer["Area"].max())
-            ),
+                domain=(unscaled_cancer["Area"].min() * 0.95, unscaled_cancer["Area"].max() * 1.05),
+            nice=False
+            )
         ),
         y=alt.Y(
             "Smoothness",
             title="Smoothness",
             scale=alt.Scale(
                 domain=(
-                    unscaled_cancer["Smoothness"].min(),
-                    unscaled_cancer["Smoothness"].max(),
-                )
+                    unscaled_cancer["Smoothness"].min() * 0.95,
+                    unscaled_cancer["Smoothness"].max() * 1.05,
+                ),
+                nice=False
             ),
         ),
         color=alt.Color("Class", title="Diagnosis"),
@@ -1858,8 +1856,8 @@ prediction_plot = (
     alt.Chart(prediction_table)
     .mark_point(opacity=0.05, filled=True, size=300)
     .encode(
-        x=alt.X("Area"),
-        y=alt.Y("Smoothness"),
+        x="Area",
+        y="Smoothness",
         color=alt.Color("Class", title="Diagnosis"),
     )
 )
 
@@ -330,16 +330,11 @@ cancer['Class'] = cancer['Class'].replace({
 # create scatter plot of tumor cell concavity versus smoothness,
 # labeling the points by diagnosis class
 
-perim_concav = (
-    alt.Chart(cancer)
-    .mark_circle()
-    .encode(
-        x="Smoothness",
-        y="Concavity",
-        color=alt.Color("Class", title="Diagnosis"),
-    )
+perim_concav = alt.Chart(cancer).mark_circle().encode(
+    x=alt.X("Smoothness", scale=alt.Scale(zero=False)),
+    y="Concavity",
+    color=alt.Color("Class", title="Diagnosis"),
 )
-
 perim_concav
 ```
 
@@ -1081,19 +1076,15 @@ as shown in {numref}`fig:06-find-k`.
 ```{code-cell} ipython3
 :tags: [remove-output]
 
-accuracy_vs_k = (
-    alt.Chart(accuracies_grid)
-    .mark_line(point=True)
-    .encode(
-        x=alt.X(
-            "n_neighbors",
-            title="Neighbors",
-        ),
-        y=alt.Y(
-            "mean_test_score",
-            title="Accuracy estimate",
-            scale=alt.Scale(domain=(0.85, 0.90)),
-        ),
+accuracy_vs_k = alt.Chart(accuracies_grid).mark_line(point=True).encode(
+    x=alt.X(
+        "n_neighbors",
+        title="Neighbors",
+    ),
+    y=alt.Y(
+        "mean_test_score",
+        title="Accuracy estimate",
+        scale=alt.Scale(domain=(0.85, 0.90)),
     )
 )
 
@@ -1170,19 +1161,15 @@ large_accuracies_grid = pd.DataFrame(
                     ).cv_results_
                   )
 
-large_accuracy_vs_k = (
-    alt.Chart(large_accuracies_grid)
-    .mark_line(point=True)
-    .encode(
-        x=alt.X(
-            "param_kneighborsclassifier__n_neighbors",
-            title="Neighbors",
-        ),
-        y=alt.Y(
-            "mean_test_score",
-            title="Accuracy estimate",
-            scale=alt.Scale(domain=(0.60, 0.90)),
-        ),
+large_accuracy_vs_k = alt.Chart(large_accuracies_grid).mark_line(point=True).encode(
+    x=alt.X(
+        "param_kneighborsclassifier__n_neighbors",
+        title="Neighbors",
+    ),
+    y=alt.Y(
+        "mean_test_score",
+        title="Accuracy estimate",
+        scale=alt.Scale(domain=(0.60, 0.90)),
     )
 )
 
@@ -1269,10 +1256,10 @@ y = cancer_train["Class"]
 
 # create a prediction pt grid
 smo_grid = np.linspace(
-    cancer_train["Smoothness"].min(), cancer_train["Smoothness"].max(), 100
+    cancer_train["Smoothness"].min() * 0.95, cancer_train["Smoothness"].max() * 1.05, 100
 )
 con_grid = np.linspace(
-    cancer_train["Concavity"].min(), cancer_train["Concavity"].max(), 100
+    cancer_train["Concavity"].min() - 0.025, cancer_train["Concavity"].max() * 1.05, 100
 )
 scgrid = np.array(np.meshgrid(smo_grid, con_grid)).reshape(2, -1).T
 scgrid = pd.DataFrame(scgrid, columns=["Smoothness", "Concavity"])
@@ -1294,8 +1281,26 @@ for k in [1, 7, 20, 300]:
         )
         .mark_point(opacity=0.2, filled=True, size=20)
         .encode(
-            x=alt.X("Smoothness"),
-            y=alt.Y("Concavity"),
+            x=alt.X(
+                "Smoothness",
+                scale=alt.Scale(
+                    domain=(
+                        cancer_train["Smoothness"].min() * 0.95,
+                        cancer_train["Smoothness"].max() * 1.05
+                    ),
+                    nice=False
+                )
+            ),
+            y=alt.Y(
+                "Concavity",
+                scale=alt.Scale(
+                    domain=(
+                        cancer_train["Concavity"].min() -0.025,
+                        cancer_train["Concavity"].max() * 1.05
+                    ),
+                    nice=False
+                )
+            ),
             color=alt.Color("Class", title="Diagnosis"),
         )
     )
 
@@ -804,13 +804,12 @@ import altair as alt
 
 +++
 
-The fundamental object in `altair` is the `Chart`, which takes a data frame as a single argument: `alt.Chart(ten_lang)`.
+The fundamental object in `altair` is the `Chart`, which takes a data frame as an argument: `alt.Chart(ten_lang)`.
 With a chart object in hand, we can now specify how we would like the data to be visualized.
-We first indicate what kind of geometric mark we want to use to represent the data. Here we set the mark attribute
+We first indicate what kind of graphical *mark* we want to use to represent the data. Here we set the mark attribute
 of the chart object using the `Chart.mark_bar` function, because we want to create a bar chart.
-Next, we need to encode the variables of the data frame using
-the `x` (represents the x-axis position of the points) and
-`y` (represents the y-axis position of the points) *channels*. We use the `encode()`
+Next, we need to *encode* the variables of the data frame using
+the `x` and `y` *channels* (which represent the x-axis and y-axis positions of the marks). We use the `encode()`
 function to handle this: we specify that the `language` column should correspond to the x-axis,
 and that the `mother_tongue` column should correspond to the y-axis.
 
@@ -853,7 +852,7 @@ Bar plot of the ten Aboriginal languages most often reported by Canadian residen
 ```{index} see: .; chaining methods
 ```
 
-### Formatting `altair` objects
+### Formatting `altair` charts
 
 It is exciting that we can already visualize our data to help answer our
 question, but we are not done yet! We can (and should) do more to improve the
@@ -865,28 +864,27 @@ example above, Python uses the column name `mother_tongue` as the label for the
 y axis, but most people will not know what that is. And even if they did, they
 will not know how we measured this variable, or the group of people on which the
 measurements were taken. An axis label that reads "Mother Tongue (Number of
-Canadian Residents)" would be much more informative.
+Canadian Residents)" would be much more informative. To make the code easier to
+read, we spread it out over multiple lines, just as we did in the previous
+section with `pandas`.
 
 ```{index} plot; labels, plot; axis labels
 ```
 
 Adding additional labels to our visualizations that we create in `altair` is
 one common and easy way to improve and refine our data visualizations. We can add titles for the axes
 in the `altair` objects using `alt.X` and `alt.Y` with the `title` argument to make
-the axes titles more informative.
+the axis titles more informative (you will learn more about `alt.X` and `alt.Y` in the {ref}`viz` chapter).
 Again, since we are specifying
 words (e.g. `"Mother Tongue (Number of Canadian Residents)"`) as arguments to
 `alt.X` and `alt.Y`, we surround them with double quotation marks. We can do many other modifications
 to format the plot further, and we will explore these in the {ref}`viz` chapter.
 
 ```{code-cell} ipython3
-barplot_mother_tongue = (
-    alt.Chart(ten_lang)
-    .mark_bar().encode(
-        x=alt.X('language', title='Language'),
-        y=alt.Y('mother_tongue', title='Mother Tongue (Number of Canadian Residents)')
-    ))
-
+barplot_mother_tongue = alt.Chart(ten_lang).mark_bar().encode(
+    x=alt.X('language', title='Language'),
+    y=alt.Y('mother_tongue', title='Mother Tongue (Number of Canadian Residents)')
+)
 ```
 
 
@@ -915,13 +913,10 @@ To accomplish this, we will swap the x and y coordinate axes:
 
 
 ```{code-cell} ipython3
-barplot_mother_tongue_axis = (
-    alt.Chart(ten_lang)
-    .mark_bar().encode(
-        x=alt.X('mother_tongue', title='Mother Tongue (Number of Canadian Residents)'),
-        y=alt.Y('language', title='Language')
-    ))
-
+barplot_mother_tongue_axis = alt.Chart(ten_lang).mark_bar().encode(
+    x=alt.X('mother_tongue', title='Mother Tongue (Number of Canadian Residents)'),
+    y=alt.Y('language', title='Language')
+)
 ```
 
 ```{code-cell} ipython3
@@ -951,13 +946,10 @@ the `sort` argument, which orders a variable (here `language`) based on the
 values of the variable (`mother_tongue`) on the x-axis.
 
 ```{code-cell} ipython3
-ordered_barplot_mother_tongue = (
-    alt.Chart(ten_lang)
-    .mark_bar().encode(
-        x=alt.X('mother_tongue', title='Mother Tongue (Number of Canadian Residents)'),
-        y=alt.Y('language', sort='x', title='Language')
-    ))
-
+ordered_barplot_mother_tongue = alt.Chart(ten_lang).mark_bar().encode(
+    x=alt.X('mother_tongue', title='Mother Tongue (Number of Canadian Residents)'),
+    y=alt.Y('language', sort='x', title='Language')
+)
 ```
 
 +++
@@ -1028,17 +1020,13 @@ ten_lang = (
     can_lang.loc[can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"]]
     .sort_values(by="mother_tongue", ascending=False)
     .head(10)
-    )
+)
 
 # create the visualization
-ten_lang_plot = (
-    alt.Chart(ten_lang)
-    .mark_bar().encode(
-        x=alt.X('mother_tongue', title='Mother Tongue (Number of Canadian Residents)'),
-        y=alt.Y('language', sort='x', title='Language')
-    ))
-
-
+ten_lang_plot = alt.Chart(ten_lang).mark_bar().encode(
+    x=alt.X('mother_tongue', title='Mother Tongue (Number of Canadian Residents)'),
+    y=alt.Y('language', sort='x', title='Language')
+)
 ```
 
 ```{code-cell} ipython3