diff --git a/Dockerfile b/Dockerfile
index 8e57cda2..1a31ee60 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,6 +18,7 @@ RUN mamba install --quiet --yes \
'numpy' \
'jinja2' \
'altair_data_server' \
+ 'altair_saver' \
'click' \
'ibis-framework' \
'ghp-import' \
diff --git a/build_html.sh b/build_html.sh
index a7a3f798..f68c05c9 100755
--- a/build_html.sh
+++ b/build_html.sh
@@ -1,2 +1,2 @@
chmod -R o+w source/
-docker run --rm -v $(pwd):/home/jovyan ubcdsci/py-intro-to-ds:202212191809333bdc71 /bin/bash -c "jupyter-book build source"
+docker run --rm -v $(pwd):/home/jovyan ubcdsci/py-intro-to-ds:20230104230634037f38 /bin/bash -c "jupyter-book build source"
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 7e821e45..00000000
--- a/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-jupyter-book
-matplotlib
-numpy
diff --git a/source/_config.yml b/source/_config.yml
index 93a176b0..779e497b 100644
--- a/source/_config.yml
+++ b/source/_config.yml
@@ -1,11 +1,7 @@
-#######################################################################################
-# Config file for EOSC211 jupyter book
-#######################################################################################
# Book settings
-
-title: DSCI 100
+title: "Data Science: A First Introduction (Python Edition)"
author: UBC
-copyright: "2021" # Copyright year to be placed in the footer
+copyright: "2022" # Copyright year to be placed in the footer
logo: "" # A path to the book logo
# Patterns to skip when building the book. Can be glob-style (e.g. "*skip.ipynb")
exclude_patterns: [_build, Thumbs.db, .DS_Store, "*.ipynb_checkpoints"]
@@ -15,10 +11,10 @@ only_build_toc_files: true
#######################################################################################
# Execution settings
execute:
- execute_notebooks: "cache" # Whether to execute notebooks at build time. Must be one of ("auto", "force", "cache", "off")
+ execute_notebooks: "auto" # Whether to execute notebooks at build time. Must be one of ("auto", "force", "cache", "off")
cache: "" # A path to the jupyter cache that will be used to store execution artifacts. Defaults to `_build/.jupyter_cache/`
# exclude_patterns: [] # A list of patterns to *skip* in execution (e.g. a notebook that takes a really long time)
- timeout: 30 # The maximum time (in seconds) each notebook cell is allowed to run.
+ timeout: 90 # The maximum time (in seconds) each notebook cell is allowed to run.
run_in_temp:
false # If `True`, then a temporary directory will be created and used as the command working directory (cwd),
# otherwise the notebook's parent directory will be the cwd.
@@ -65,19 +61,15 @@ latex:
latex_engine: pdflatex # one of 'pdflatex', 'xelatex' (recommended for unicode), 'luatex', 'platex', 'uplatex'
use_jupyterbook_latex: true # use sphinx-jupyterbook-latex for pdf builds as default
-
#######################################################################################
-
# Launch button settings
launch_buttons:
binderhub_url: ""
-
-
repository:
- url: https://github.com/phaustin/eosc211_students # The URL to your book's repository
- path_to_book: "" # A path to your book's folder, relative to the repository root.
- branch: e211_live_main # Which branch of the repository should be used when creating links
+ url: https://github.com/UBC-DSCI/introduction-to-datascience-python # The URL to your book's repository
+ path_to_book: "source" # A path to your book's folder, relative to the repository root.
+ branch: production # Which branch of the repository should be used when creating links
#######################################################################################
# Advanced and power-user settings
diff --git a/source/_toc.yml b/source/_toc.yml
index 61dfc676..58497d23 100644
--- a/source/_toc.yml
+++ b/source/_toc.yml
@@ -1,15 +1,18 @@
format: jb-book
root: index.md
-options:
- numbered: true
parts:
-- caption: First draft
+- caption: Front Matter
chapters:
- file: preface-text.md
- - file: foreword-text.md
+ #- file: foreword.md
- file: acknowledgements.md
+ - file: acknowledgements-python.md
- file: authors.md
- - file: setup.md
+ - file: editors.md
+ #- file: setup.md
+- caption: Chapters
+ numbered: 3
+ chapters:
- file: intro.md
- file: reading.md
- file: wrangling.md
@@ -20,5 +23,7 @@ parts:
- file: regression2.md
- file: clustering.md
- file: inference.md
- - file: references.md
+- caption: Appendix
+ chapters:
- file: appendixA.md
+ #- file: references.md
diff --git a/source/acknowledgements-python.md b/source/acknowledgements-python.md
new file mode 100644
index 00000000..dc687718
--- /dev/null
+++ b/source/acknowledgements-python.md
@@ -0,0 +1,25 @@
+---
+jupytext:
+ cell_metadata_filter: -all
+ formats: py:percent,md:myst,ipynb
+ text_representation:
+ extension: .md
+ format_name: myst
+ format_version: 0.13
+ jupytext_version: 1.13.8
+kernelspec:
+ display_name: Python 3 (ipykernel)
+ language: python
+ name: python3
+---
+
+# Acknowledgments for the Python Edition
+
+We'd like to thank everyone that has contributed to the development of
+[*Data Science: A First Introduction (Python Edition)*](https://ubc-dsci.github.io/introduction-to-datascience-python/).
+This is an open source Python translation of the original [*Data Science: A First Introduction*](https://datasciencebook.ca);
+the original focused on the R programming language. Both of these books are
+used to teach DSCI 100, a new introductory data science course
+at the University of British Columbia (UBC).
+
+We will finalize this acknowledgements section after the book is complete!
diff --git a/source/acknowledgements.md b/source/acknowledgements.md
index e0ec1699..82ecc5c7 100644
--- a/source/acknowledgements.md
+++ b/source/acknowledgements.md
@@ -13,7 +13,7 @@ kernelspec:
name: python3
---
-# Acknowledgments -- TBD
+# Acknowledgments
We'd like to thank everyone that has contributed to the development of
[*Data Science: A First Introduction*](https://datasciencebook.ca).
diff --git a/source/appendixA.md b/source/appendixA.md
index a1e1bcc3..7e57bf72 100644
--- a/source/appendixA.md
+++ b/source/appendixA.md
@@ -13,9 +13,7 @@ kernelspec:
name: python3
---
-# Appendix
-
-# Downloading files from JupyterHub {#appendixA}
+# Downloading files from JupyterHub
This section will help you
save your work from a JupyterHub web-based platform to your own computer.
diff --git a/source/authors.md b/source/authors.md
index b6465c76..7e6dc803 100644
--- a/source/authors.md
+++ b/source/authors.md
@@ -13,7 +13,7 @@ kernelspec:
name: python3
---
-# About the authors -- TBD
+# About the authors
**Tiffany Timbers** is an Assistant Professor of Teaching in the Department of
Statistics and Co-Director for the Master of Data Science program (Vancouver
diff --git a/source/editors.md b/source/editors.md
new file mode 100644
index 00000000..dedb5171
--- /dev/null
+++ b/source/editors.md
@@ -0,0 +1,51 @@
+---
+jupytext:
+ cell_metadata_filter: -all
+ formats: py:percent,md:myst,ipynb
+ text_representation:
+ extension: .md
+ format_name: myst
+ format_version: 0.13
+ jupytext_version: 1.13.8
+kernelspec:
+ display_name: Python 3 (ipykernel)
+ language: python
+ name: python3
+---
+
+# About the editors of the Python Edition
+
+**Trevor Campbell** is an Assistant Professor in the Department of Statistics at
+the University of British Columbia. His research focuses on automated, scalable
+Bayesian inference algorithms, Bayesian nonparametrics, streaming data, and
+Bayesian theory. He was previously a postdoctoral associate advised by Tamara
+Broderick in the Computer Science and Artificial Intelligence Laboratory
+(CSAIL) and Institute for Data, Systems, and Society (IDSS) at MIT, a Ph.D.
+candidate under Jonathan How in the Laboratory for Information and Decision
+Systems (LIDS) at MIT, and before that he was in the Engineering Science
+program at the University of Toronto.
+
++++
+
+**Lindsey Heagy** is an Assistant Professor in the Department of Earth, Ocean, and Atmospheric
+Sciences and director of the Geophysical Inversion Facility at the University of British Columbia.
+Her research combines computational methods in numerical simulations, inversions, and machine
+learning to answer questions about the subsurface of the Earth. Primary applications include
+mineral exploration, carbon sequestration, groundwater, and environmental studies. She
+completed her BSc at the University of Alberta, her PhD at the University of British Columbia,
+and held a postdoctoral research position at the University of California, Berkeley prior to
+starting her current position at UBC.
+
++++
+
+**Joel Ostblom** is an Assistant Professor of Teaching in the Department of
+Statistics at the University of British Columbia.
+During his PhD, Joel developed a passion for data science and reproducibility
+through the development of quantitative image analysis pipelines for studying
+stem cell and developmental biology. He has since co-created or led the
+development of several courses and workshops at the University of Toronto and
+is now an assistant professor of teaching in the statistics department at the
+University of British Columbia. Joel cares deeply about spreading data literacy
+and excitement over programmatic data analysis, which is reflected in his
+contributions to open source projects and data science learning resources. You
+can read more about Joel on his [personal page](https://joelostblom.com/).
diff --git a/source/img/altair_syntax.png b/source/img/altair_syntax.png
new file mode 100644
index 00000000..55676cdb
Binary files /dev/null and b/source/img/altair_syntax.png differ
diff --git a/source/img/code-figures.pptx b/source/img/code-figures.pptx
new file mode 100644
index 00000000..e671a57b
Binary files /dev/null and b/source/img/code-figures.pptx differ
diff --git a/source/img/completion_menu.png b/source/img/completion_menu.png
new file mode 100644
index 00000000..1de73d77
Binary files /dev/null and b/source/img/completion_menu.png differ
diff --git a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.001.jpeg b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.001.jpeg
index fa1e065d..276300cc 100644
Binary files a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.001.jpeg and b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.001.jpeg differ
diff --git a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.002.jpeg b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.002.jpeg
index 4fcf0966..b29831ee 100644
Binary files a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.002.jpeg and b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.002.jpeg differ
diff --git a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.004.jpeg b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.004.jpeg
index ae68f0d2..8675de1e 100644
Binary files a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.004.jpeg and b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.004.jpeg differ
diff --git a/source/img/faithful_plot.png b/source/img/faithful_plot.png
index fa93f603..a0e986de 100644
Binary files a/source/img/faithful_plot.png and b/source/img/faithful_plot.png differ
diff --git a/source/img/faithful_plot.svg b/source/img/faithful_plot.svg
index cf6ae779..21282faf 100644
--- a/source/img/faithful_plot.svg
+++ b/source/img/faithful_plot.svg
@@ -1,346 +1 @@
-
-
+
\ No newline at end of file
diff --git a/source/img/filter_rows.png b/source/img/filter_rows.png
new file mode 100644
index 00000000..5d15ca4f
Binary files /dev/null and b/source/img/filter_rows.png differ
diff --git a/source/img/filter_rows_and_columns.png b/source/img/filter_rows_and_columns.png
new file mode 100644
index 00000000..124a7dc4
Binary files /dev/null and b/source/img/filter_rows_and_columns.png differ
diff --git a/source/img/help_dialog.png b/source/img/help_dialog.png
new file mode 100644
index 00000000..c2197ab7
Binary files /dev/null and b/source/img/help_dialog.png differ
diff --git a/source/img/pivot_functions/pivot_functions.001.jpeg b/source/img/pivot_functions/pivot_functions.001.jpeg
index f72151ba..fc5123f3 100644
Binary files a/source/img/pivot_functions/pivot_functions.001.jpeg and b/source/img/pivot_functions/pivot_functions.001.jpeg differ
diff --git a/source/img/pivot_functions/pivot_functions.002.jpeg b/source/img/pivot_functions/pivot_functions.002.jpeg
index 5e83772e..961c0813 100644
Binary files a/source/img/pivot_functions/pivot_functions.002.jpeg and b/source/img/pivot_functions/pivot_functions.002.jpeg differ
diff --git a/source/img/read_csv_function.png b/source/img/read_csv_function.png
new file mode 100644
index 00000000..4593eaa9
Binary files /dev/null and b/source/img/read_csv_function.png differ
diff --git a/source/img/select_columns.png b/source/img/select_columns.png
new file mode 100644
index 00000000..f316180d
Binary files /dev/null and b/source/img/select_columns.png differ
diff --git a/source/img/sort_values.png b/source/img/sort_values.png
new file mode 100644
index 00000000..770ce22d
Binary files /dev/null and b/source/img/sort_values.png differ
diff --git a/source/img/summarize/summarize.001.jpeg b/source/img/summarize/summarize.001.jpeg
index 1ffbaa57..7960e61e 100644
Binary files a/source/img/summarize/summarize.001.jpeg and b/source/img/summarize/summarize.001.jpeg differ
diff --git a/source/img/summarize/summarize.002.jpeg b/source/img/summarize/summarize.002.jpeg
index 5a6dbbd0..97995520 100644
Binary files a/source/img/summarize/summarize.002.jpeg and b/source/img/summarize/summarize.002.jpeg differ
diff --git a/source/img/summarize/summarize.003.jpeg b/source/img/summarize/summarize.003.jpeg
index a9d50b07..0a97f6be 100644
Binary files a/source/img/summarize/summarize.003.jpeg and b/source/img/summarize/summarize.003.jpeg differ
diff --git a/source/img/summarize/summarize.004.jpeg b/source/img/summarize/summarize.004.jpeg
index f3553dba..476ad698 100644
Binary files a/source/img/summarize/summarize.004.jpeg and b/source/img/summarize/summarize.004.jpeg differ
diff --git a/source/img/summarize/summarize.005.jpeg b/source/img/summarize/summarize.005.jpeg
index b2b1b2ca..d1a4f710 100644
Binary files a/source/img/summarize/summarize.005.jpeg and b/source/img/summarize/summarize.005.jpeg differ
diff --git a/source/img/wrangling/pandas_dataframe_series-3.png b/source/img/wrangling/pandas_dataframe_series-3.png
index a93bf397..6a2eea54 100644
Binary files a/source/img/wrangling/pandas_dataframe_series-3.png and b/source/img/wrangling/pandas_dataframe_series-3.png differ
diff --git a/source/img/wrangling/pandas_dataframe_series.png b/source/img/wrangling/pandas_dataframe_series.png
index 285a6559..75ffc893 100644
Binary files a/source/img/wrangling/pandas_dataframe_series.png and b/source/img/wrangling/pandas_dataframe_series.png differ
diff --git a/source/img/wrangling/pandas_melt_args_labels.png b/source/img/wrangling/pandas_melt_args_labels.png
index a1f9bd98..a24eb439 100644
Binary files a/source/img/wrangling/pandas_melt_args_labels.png and b/source/img/wrangling/pandas_melt_args_labels.png differ
diff --git a/source/img/wrangling/pandas_melt_wide-long.png b/source/img/wrangling/pandas_melt_wide-long.png
index 994e32a7..03b30975 100644
Binary files a/source/img/wrangling/pandas_melt_wide-long.png and b/source/img/wrangling/pandas_melt_wide-long.png differ
diff --git a/source/img/wrangling/pandas_pivot_args_labels.png b/source/img/wrangling/pandas_pivot_args_labels.png
index 7d57644c..0f961aaf 100644
Binary files a/source/img/wrangling/pandas_pivot_args_labels.png and b/source/img/wrangling/pandas_pivot_args_labels.png differ
diff --git a/source/img/wrangling/pandas_pivot_long-wide.png b/source/img/wrangling/pandas_pivot_long-wide.png
index 994e0510..faff307b 100644
Binary files a/source/img/wrangling/pandas_pivot_long-wide.png and b/source/img/wrangling/pandas_pivot_long-wide.png differ
diff --git a/source/index.md b/source/index.md
index 304a3606..be402176 100644
--- a/source/index.md
+++ b/source/index.md
@@ -13,19 +13,21 @@ kernelspec:
name: python3
---
-# Welcome -- TBD
+# Welcome!
-This is the [website](https://datasciencebook.ca/) for *Data Science: A First Introduction*.
+This is the [website](https://ubc-dsci.github.io/introduction-to-datascience-python/) for *Data Science: A First Introduction (Python Edition)*.
You can read the web version of the book on this site. Click a section in the table of contents
on the left side of the page to navigate to it. If you are on a mobile device,
-you may need to open the table of contents first by clicking the menu button on
+you may need to open the table of contents first by clicking the menu button on
the top left of the page.
-You can purchase a PDF or print copy of the book
-on the [CRC Press website](https://www.routledge.com/Data-Science-A-First-Introduction/Timbers-Campbell-Lee/p/book/9780367524685) or on [Amazon](https://www.amazon.com/Data-Science-First-Introduction-Chapman/dp/0367532174/ref=sr_[…]qid=1644637450&sprefix=data+science+timber%2Caps%2C166&sr=8-1).
+
+For the R version of the textbook, please visit https://datasciencebook.ca.
+You can purchase a PDF or print copy of the R version of the book
+on the [CRC Press website](https://www.routledge.com/Data-Science-A-First-Introduction/Timbers-Campbell-Lee/p/book/9780367524685) or
+on [Amazon](https://www.amazon.com/Data-Science-First-Introduction-Chapman/dp/0367532174/ref=sr_[…]qid=1644637450&sprefix=data+science+timber%2Caps%2C166&sr=8-1).
-This work by [Tiffany Timbers](https://www.tiffanytimbers.com/), [Trevor Campbell](https://trevorcampbell.me/),
-and [Melissa Lee](https://www.stat.ubc.ca/users/melissa-lee) is licensed under
+This work by [Tiffany Timbers](https://www.tiffanytimbers.com/), [Trevor Campbell](https://trevorcampbell.me/),
+and [Melissa Lee](https://www.stat.ubc.ca/users/melissa-lee) is licensed under
a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-nc-sa/4.0/).
-
diff --git a/source/intro.md b/source/intro.md
index 9683b4ef..bad9f768 100644
--- a/source/intro.md
+++ b/source/intro.md
@@ -24,9 +24,9 @@ from myst_nb import glue
This chapter provides an introduction to data science and the Python programming language.
The goal here is to get your hands dirty right from the start! We will walk through an entire data analysis,
-and along the way introduce different types of data analysis question, some fundamental programming
+and along the way introduce different types of data analysis question, some fundamental programming
concepts in Python, and the basics of loading, cleaning, and visualizing data. In the following chapters, we will
-dig into each of these steps in much more detail; but for now, let's jump in to see how much we can do
+dig into each of these steps in much more detail; but for now, let's jump in to see how much we can do
with data science!
## Chapter learning objectives
@@ -38,7 +38,8 @@ By the end of the chapter, readers will be able to do the following:
- Read tabular data with `read_csv`.
- Use `help()` to access help and documentation tools in Python.
- Create new variables and objects in Python.
-- Create and organize subsets of tabular data using `[]`, `loc[]`, and `sort_values`
+- Create and organize subsets of tabular data using `[]`, `loc[]`, and `sort_values`.
+- Chain multiple operations in sequence.
- Visualize data with an `altair` bar plot.
## Canadian languages data set
@@ -47,7 +48,7 @@ By the end of the chapter, readers will be able to do the following:
```
In this chapter, we will walk through a full analysis of a data set relating to
-languages spoken at home by Canadian residents. Many Indigenous peoples exist in Canada
+languages spoken at home by Canadian residents. Many Indigenous peoples exist in Canada
with their own cultures and languages; these languages are often unique to Canada and not spoken
anywhere else in the world {cite:p}`statcan2018mothertongue`. Sadly, colonization has
led to the loss of many of these languages. For instance, generations of
@@ -55,18 +56,18 @@ children were not allowed to speak their mother tongue (the first language an
individual learns in childhood) in Canadian residential schools. Colonizers
also renamed places they had "discovered" {cite:p}`wilson2018`. Acts such as these
have significantly harmed the continuity of Indigenous languages in Canada, and
-some languages are considered "endangered" as few people report speaking them.
-To learn more, please see *Canadian Geographic*'s article, "Mapping Indigenous Languages in
-Canada" {cite:p}`walker2017`,
-*They Came for the Children: Canada, Aboriginal
-peoples, and Residential Schools* {cite:p}`children2012`
-and the *Truth and Reconciliation Commission of Canada's*
+some languages are considered "endangered" as few people report speaking them.
+To learn more, please see *Canadian Geographic*'s article, "Mapping Indigenous Languages in
+Canada" {cite:p}`walker2017`,
+*They Came for the Children: Canada, Aboriginal
+peoples, and Residential Schools* {cite:p}`children2012`
+and the *Truth and Reconciliation Commission of Canada's*
*Calls to Action* {cite:p}`calls2015`.
-The data set we will study in this chapter is taken from
-[the `canlang` R data package](https://ttimbers.github.io/canlang/)
+The data set we will study in this chapter is taken from
+[the `canlang` R data package](https://ttimbers.github.io/canlang/)
{cite:p}`timbers2020canlang`, which has
-population language data collected during the 2016 Canadian census {cite:p}`cancensus2016`.
+population language data collected during the 2016 Canadian census {cite:p}`cancensus2016`.
In this data, there are 214 languages recorded, each having six different properties:
1. `category`: Higher-level language category, describing whether the language is an Official Canadian language, an Aboriginal (i.e., Indigenous) language, or a Non-Official and Non-Aboriginal language.
@@ -78,15 +79,15 @@ In this data, there are 214 languages recorded, each having six different proper
According to the census, more than 60 Aboriginal languages were reported
as being spoken in Canada. Suppose we want to know which are the most common;
-then we might ask the following question, which we wish to answer using our data:
+then we might ask the following question, which we wish to answer using our data:
*Which ten Aboriginal languages were most often reported in 2016 as mother
-tongues in Canada, and how many people speak each of them?*
+tongues in Canada, and how many people speak each of them?*
```{index} data science; good practices
```
-> **Note:** Data science cannot be done without
+> **Note:** Data science cannot be done without
> a deep understanding of the data and
> problem domain. In this book, we have simplified the data sets used in our
> examples to concentrate on methods and fundamental concepts. But in real
@@ -96,15 +97,15 @@ tongues in Canada, and how many people speak each of them?*
> about *how* the data were collected, which affects the conclusions you can
> draw. If your data are biased, then your results will be biased!
-## Asking a question
+## Asking a question
Every good data analysis begins with a *question*—like the
above—that you aim to answer using data. As it turns out, there
are actually a number of different *types* of question regarding data:
descriptive, exploratory, inferential, predictive, causal, and mechanistic,
all of which are defined in {numref}`questions-table`. {cite:p}`leek2015question,peng2015art`
-Carefully formulating a question as early as possible in your analysis—and
-correctly identifying which type of question it is—will guide your overall approach to
+Carefully formulating a question as early as possible in your analysis—and
+correctly identifying which type of question it is—will guide your overall approach to
the analysis as well as the selection of appropriate tools.
```{index} question; data analysis, descriptive question; definition, exploratory question; definition
@@ -138,12 +139,12 @@ the analysis as well as the selection of appropriate tools.
* - Mechanistic
- A question that asks about the underlying mechanism of the observed patterns, trends, or relationships (i.e., how does it happen?)
- How does wealth lead to voting for a certain political party in Canadian elections?
-
+
```
-In this book, you will learn techniques to answer the
-first four types of question: descriptive, exploratory, predictive, and inferential;
+In this book, you will learn techniques to answer the
+first four types of question: descriptive, exploratory, predictive, and inferential;
causal and mechanistic questions are beyond the scope of this book.
In particular, you will learn how to apply the following analysis tools:
@@ -153,25 +154,25 @@ In particular, you will learn how to apply the following analysis tools:
```{index} clustering; overview, estimation; overview
```
-1. **Summarization:** computing and reporting aggregated values pertaining to a data set.
+1. **Summarization:** computing and reporting aggregated values pertaining to a data set.
Summarization is most often used to answer descriptive questions,
and can occasionally help with answering exploratory questions.
-For example, you might use summarization to answer the following question:
+For example, you might use summarization to answer the following question:
*What is the average race time for runners in this data set?*
Tools for summarization are covered in detail in the {ref}`reading`
and {ref}`wrangling` chapters, but appear regularly throughout the text.
-1. **Visualization:** plotting data graphically.
+1. **Visualization:** plotting data graphically.
Visualization is typically used to answer descriptive and exploratory questions,
but plays a critical supporting role in answering all of the types of question in {numref}`questions-table`.
For example, you might use visualization to answer the following question:
-*Is there any relationship between race time and age for runners in this data set?*
+*Is there any relationship between race time and age for runners in this data set?*
This is covered in detail in the {ref}`viz` chapter, but again appears regularly throughout the book.
3. **Classification:** predicting a class or category for a new observation.
Classification is used to answer predictive questions.
For example, you might use classification to answer the following question:
*Given measurements of a tumor's average cell area and perimeter, is the tumor benign or malignant?*
Classification is covered in the {ref}`classification` and {ref}`classification2` chapters.
-4. **Regression:** predicting a quantitative value for a new observation.
+4. **Regression:** predicting a quantitative value for a new observation.
Regression is also used to answer predictive questions.
For example, you might use regression to answer the following question:
*What will be the race time for a 20-year-old runner who weighs 50kg?*
@@ -181,22 +182,22 @@ data set. Clustering is often used to answer exploratory questions.
For example, you might use clustering to answer the following question:
*What products are commonly bought together on Amazon?*
Clustering is covered in the {ref}`clustering` chapter.
-6. **Estimation:** taking measurements for a small number of items from a large group
- and making a good guess for the average or proportion for the large group. Estimation
+6. **Estimation:** taking measurements for a small number of items from a large group
+ and making a good guess for the average or proportion for the large group. Estimation
is used to answer inferential questions.
For example, you might use estimation to answer the following question:
*Given a survey of cellphone ownership of 100 Canadians, what proportion
-of the entire Canadian population own Android phones?*
+of the entire Canadian population own Android phones?*
Estimation is covered in the {ref}`inference` chapter.
-Referring to {numref}`questions-table`, our question about
+Referring to {numref}`questions-table`, our question about
Aboriginal languages is an example of a *descriptive question*: we are
summarizing the characteristics of a data set without further interpretation.
And referring to the list above, it looks like we should use visualization
and perhaps some summarization to answer the question. So in the remainder
-of this chapter, we will work towards making a visualization that shows
+of this chapter, we will work towards making a visualization that shows
us the ten most common Aboriginal languages in Canada and their associated counts,
-according to the 2016 census.
+according to the 2016 census.
## Loading a tabular data set
@@ -204,7 +205,7 @@ according to the 2016 census.
```
A data set is, at its core, a structured collection of numbers and characters.
-Aside from that, there are really no strict rules; data sets can come in
+Aside from that, there are really no strict rules; data sets can come in
many different forms! Perhaps the most common form of data set that you will
find in the wild, however, is *tabular data*. Think spreadsheets in Microsoft Excel: tabular data are
rectangular-shaped and spreadsheet-like, as shown in {numref}`img-spreadsheet-vs-data frame`. In this book, we will focus primarily on tabular data.
@@ -216,14 +217,14 @@ Since we are using Python for data analysis in this book, the first step for us
load the data into Python. When we load tabular data into
Python, it is represented as a *data frame* object. {numref}`img-spreadsheet-vs-data frame` shows that a Python data frame is very similar
to a spreadsheet. We refer to the rows as **observations**; these are the things that we
-collect the data on, e.g., voters, cities, etc. We refer to the columns as
+collect the data on, e.g., voters, cities, etc. We refer to the columns as
**variables**; these are the characteristics of those observations, e.g., voters' political
-affiliations, cities' populations, etc.
+affiliations, cities' populations, etc.
```{figure} img/spreadsheet_vs_df.png
---
-height: 400px
+height: 500px
name: img-spreadsheet-vs-data frame
---
A spreadsheet versus a data frame in Python
@@ -239,7 +240,7 @@ The first kind of data file that we will learn how to load into Python as a data
frame is the *comma-separated values* format (`.csv` for short). These files
have names ending in `.csv`, and can be opened and saved using common
spreadsheet programs like Microsoft Excel and Google Sheets. For example, the
-`.csv` file named `can_lang.csv`
+`.csv` file named `can_lang.csv`
is included with [the code for this book](https://github.com/UBC-DSCI/introduction-to-datascience-python/tree/main/source/data).
If we were to open this data in a plain text editor (a program like Notepad that just shows
text with no formatting), we would see each row on its own line, and each entry in the table separated by a comma:
@@ -264,7 +265,7 @@ To load this data into Python so that we can do things with it (e.g., perform
analyses or create data visualizations), we will need to use a *function.* A
function is a special word in Python that takes instructions (we call these
*arguments*) and does something. The function we will use to load a `.csv` file
-into Python is called `read_csv`. In its most basic
+into Python is called `read_csv`. In its most basic
use-case, `read_csv` expects that the data file:
- has column names (or *headers*),
@@ -280,14 +281,14 @@ Below you'll see the code used to load the data into Python using the `read_csv`
function. Note that the `read_csv` function is not included in the base
installation of Python, meaning that it is not one of the primary functions ready to
use when you install Python. Therefore, you need to load it from somewhere else
-before you can use it. The place from which we will load it is called a Python *package*.
+before you can use it. The place from which we will load it is called a Python *package*.
A Python package is a collection of functions that can be used in addition to the
built-in Python package functions once loaded. The `read_csv` function, in
-particular, can be made accessible by loading
+particular, can be made accessible by loading
[the `pandas` Python package](https://pypi.org/project/pandas/) {cite:p}`reback2020pandas,mckinney-proc-scipy-2010`
using the `import` command. The `pandas` package contains many
-functions that we will use throughout this book to load, clean, wrangle,
-and visualize data.
+functions that we will use throughout this book to load, clean, wrangle,
+and visualize data.
+++
@@ -296,25 +297,23 @@ import pandas as pd
```
This command has two parts. The first is `import pandas`, which loads the `pandas` package.
-The second is `as pd`, which give the `pandas` package the much shorter *alias* (another name) `pd`.
+The second is `as pd`, which gives the `pandas` package the much shorter *alias* (another name) `pd`.
We can now use the `read_csv` function by writing `pd.read_csv`, i.e., the package name, then a dot, then the function name.
You can see why we gave `pandas` a shorter alias; if we had to type `pandas.` before every function we wanted to use,
our code would become much longer and harder to read!
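+
+Here is a minimal sketch (not part of our analysis) showing that the alias and
+the full package name refer to the very same function:
+
+```{code-cell} ipython3
+import pandas
+
+# `pd` is just another name for the `pandas` package, so the two are identical
+pandas.read_csv is pd.read_csv
+```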
-Now that the `pandas` package is loaded, we can use the `read_csv` function by passing
+Now that the `pandas` package is loaded, we can use the `read_csv` function by passing
it a single argument: the name of the file, `"can_lang.csv"`. We have to
put quotes around file names and other letters and words that we use in our
code to distinguish them from the special words (like functions!) that make up the Python programming
language. The file's name is the only argument we need to provide because our
file satisfies everything else that the `read_csv` function expects in the default
use-case. {numref}`img-read-csv` describes how we use the `read_csv`
-to read data into Python.
-
-**(FIGURE 1.2 FROM R BOOK IS NOT MISSING, BUT STILL R VERSION. NEEDS PD.READ_CSV)**
+function to read data into Python.
-```{figure} img/read_csv_function.jpeg
+```{figure} img/read_csv_function.png
---
-height: 200px
+height: 220px
name: img-read-csv
---
Syntax for the `read_csv` function
@@ -323,6 +322,7 @@ Syntax for the `read_csv` function
+++
```{code-cell} ipython3
+:tags: ["output_scroll"]
pd.read_csv("data/can_lang.csv")
```
@@ -332,11 +332,11 @@ pd.read_csv("data/can_lang.csv")
## Naming things in Python
When we loaded the 2016 Canadian census language data
-using `read_csv`, we did not give this data frame a name.
-Therefore the data was just printed on the screen,
-and we cannot do anything else with it. That isn't very useful.
-What would be more useful would be to give a name
-to the data frame that `read_csv` outputs,
+using `read_csv`, we did not give this data frame a name.
+Therefore the data was just printed on the screen,
+and we cannot do anything else with it. That isn't very useful.
+What would be more useful would be to give a name
+to the data frame that `read_csv` outputs,
so that we can refer to it later for analysis and visualization.
```{index} see: =; assignment symbol
@@ -345,7 +345,7 @@ so that we can refer to it later for analysis and visualization.
```{index} assignment symbol, string
```
-The way to assign a name to a value in Python is via the *assignment symbol* `=`.
+The way to assign a name to a value in Python is via the *assignment symbol* `=`.
On the left side of the assignment symbol you put the name that you want
to use, and on the right side of the assignment symbol
you put the value that you want the name to refer to.
@@ -360,17 +360,17 @@ my_number = 1 + 2
name = "Alice"
```
-Note that when
-we name something in Python using the assignment symbol, `=`,
-we do not need to surround the name we are creating with quotes. This is
+Note that when
+we name something in Python using the assignment symbol, `=`,
+we do not need to surround the name we are creating with quotes. This is
because we are formally telling Python that this special word denotes
the value of whatever is on the right-hand side.
Only characters and words that act as *values* on the right-hand side of the assignment
-symbol—e.g., the file name `"data/can_lang.csv"` that we specified before, or `"Alice"` above—need
+symbol—e.g., the file name `"data/can_lang.csv"` that we specified before, or `"Alice"` above—need
to be surrounded by quotes.
After making the assignment, we can use the special name words we have created in
-place of their values. For example, if we want to do something with the value `3` later on,
+place of their values. For example, if we want to do something with the value `3` later on,
we can just use `my_number` instead. Let's try adding 2 to `my_number`; you will see that
Python just interprets this as adding 2 and 3:
@@ -397,7 +397,7 @@ SyntaxError: cannot assign to operator
```{index} object; naming convention
```
-There are certain conventions for naming objects in Python.
+There are certain conventions for naming objects in Python.
When naming an object we
suggest using only lowercase letters, numbers and underscores `_` to separate
the words in a name. Python is case sensitive, which means that `Letter` and
@@ -408,23 +408,24 @@ remember what each name in your code represents. We recommend following the
**PEP 8** naming conventions outlined in the *[PEP 8 style guide](https://peps.python.org/pep-0008/)* {cite:p}`pep8-style-guide`. Let's
now use the assignment symbol to give the name
`can_lang` to the 2016 Canadian census language data frame that we get from
-`read_csv`.
+`read_csv`.
```{code-cell} ipython3
can_lang = pd.read_csv("data/can_lang.csv")
```
Wait a minute, nothing happened this time! Where's our data?
-Actually, something did happen: the data was loaded in
-and now has the name `can_lang` associated with it.
-And we can use that name to access the data frame and do things with it.
-For example, we can type the name of the data frame to print both the first few rows
+Actually, something did happen: the data was loaded in
+and now has the name `can_lang` associated with it.
+And we can use that name to access the data frame and do things with it.
+For example, we can type the name of the data frame to print both the first few rows
and the last few rows. The three dots (`...`) indicate that there are additional rows that are not printed.
-You will also see that the number of observations (i.e., rows) and
-variables (i.e., columns) are printed just underneath the data frame (214 rows and 6 columns in this case).
+You will also see that the number of observations (i.e., rows) and
+variables (i.e., columns) are printed just underneath the data frame (214 rows and 6 columns in this case).
Printing a few rows from a data frame like this is a handy way to get a quick sense for what is contained in it.
```{code-cell} ipython3
+:tags: ["output_scroll"]
can_lang
```
@@ -435,8 +436,8 @@ can_lang
Now that we've loaded our data into Python, we can start wrangling the data to
find the ten Aboriginal languages that were most often reported
-in 2016 as mother tongues in Canada. In particular, we want to construct
-a table with the ten Aboriginal languages that have the largest
+in 2016 as mother tongues in Canada. In particular, we want to construct
+a table with the ten Aboriginal languages that have the largest
counts in the `mother_tongue` column. The first step is to extract
from our `can_lang` data only those rows that correspond to Aboriginal languages,
and then the second step is to keep only the `language` and `mother_tongue` columns.
@@ -457,8 +458,8 @@ and then use `loc[]` to do both in our analysis of the Aboriginal languages data
Looking at the `can_lang` data above, we see the column `category` contains different
high-level categories of languages, which include "Aboriginal languages",
"Non-Official & Non-Aboriginal languages" and "Official languages". To answer
-our question we want to filter our data set so we restrict our attention
-to only those languages in the "Aboriginal languages" category.
+our question we want to filter our data set so we restrict our attention
+to only those languages in the "Aboriginal languages" category.
```{index} pandas.DataFrame; [], filter, logical statement, logical statement; equivalency operator, string
```
@@ -476,20 +477,18 @@ column---denoted by `can_lang["category"]`---with the value `"Aboriginal languag
You will learn about many other kinds of logical
statement in the {ref}`wrangling` chapter. Similar to when we loaded the data file and put quotes
around the file name, here we need to put quotes around both `"Aboriginal languages"` and `"category"`. Using
-quotes tells Python that this is a *string value* (e.g., a column name, or word data)
-and not one of the special words that makes up the Python programming language,
+quotes tells Python that this is a *string value* (e.g., a column name, or word data)
+and not one of the special words that makes up the Python programming language,
or one of the names we have given to objects in the code we have already written.
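+
+As a quick aside, here is a hypothetical peek (not a step in our analysis) at
+what the logical statement produces on its own: a column of `True`/`False`
+values, one per row, which the `[]` operation then uses to decide which rows
+to keep.
+
+```{code-cell} ipython3
+# True where the category is "Aboriginal languages", False elsewhere
+can_lang["category"] == "Aboriginal languages"
+```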
> **Note:** In Python, single quotes (`'`) and double quotes (`"`) are generally
-> treated the same. So we could have written `'Aboriginal languages'` instead
+> treated the same. So we could have written `'Aboriginal languages'` instead
> of `"Aboriginal languages"` above, or `'category'` instead of `"category"`.
> Try both out for yourself!
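+
+For instance, this minimal check (a toy example, not part of the analysis)
+shows that the two quote styles produce identical strings:
+
+```{code-cell} ipython3
+'Aboriginal languages' == "Aboriginal languages"
+```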
-**(This figure is wrong-- should be for [] operation below)**
-
-```{figure} img/read_csv_function.jpeg
+```{figure} img/filter_rows.png
---
-height: 200px
+height: 220px
name: img-filter
---
Syntax for using the `[]` operation to filter rows.
@@ -499,6 +498,7 @@ This operation returns a data frame that has all the columns of the input data f
but only those rows corresponding to Aboriginal languages that we asked for in the logical statement.
```{code-cell} ipython3
+:tags: ["output_scroll"]
can_lang[can_lang["category"] == "Aboriginal languages"]
```
@@ -513,16 +513,14 @@ We can also use the `[]` operation to select columns from a data frame.
We again first type the name of the data frame---here, `can_lang`---followed
by square brackets. Inside the square brackets, we provide a *list* of
column names. In Python, we denote a *list* using square brackets, where
-each item is separated by a comma (`,`). So if we are interested in
+each item is separated by a comma (`,`). So if we are interested in
selecting only the `language` and `mother_tongue` columns from our original
`can_lang` data frame, we put the list `["language", "mother_tongue"]`
containing those two column names inside the square brackets of the `[]` operation.
-**(This figure is wrong-- should be for [] operation below)**
-
-```{figure} img/read_csv_function.jpeg
+```{figure} img/select_columns.png
---
-height: 200px
+height: 220px
name: img-select
---
Syntax for using the `[]` operation to select columns.
@@ -549,30 +547,30 @@ The syntax is very similar to the `[]` operation we have already covered: we wil
essentially combine both our row filtering and column selection steps from before.
In particular, we first write the name of the data frame---`can_lang` again---then follow
that with the `.loc[]` method. Inside the square brackets,
-we write our row filtering logical statement,
+we write our row filtering logical statement,
then a comma, then our list of columns to select.
-**(This figure is wrong-- should be for .loc[] operation below)**
-
-```{figure} img/read_csv_function.jpeg
+```{figure} img/filter_rows_and_columns.png
---
-height: 200px
+height: 220px
name: img-loc
---
Syntax for using the `loc[]` operation to filter rows and select columns.
```
```{code-cell} ipython3
-aboriginal_lang = can_lang.loc[can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"]]
+aboriginal_lang = can_lang.loc[
+ can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"]
+]
```
-There is one very important thing to notice in this code example.
+There is one very important thing to notice in this code example:
we used the `loc[]` operation on the `can_lang` data frame by
writing `can_lang.loc[]`---first the data frame name, then a dot, then `loc[]`.
There's that dot again! If you recall, earlier in this chapter we used the `read_csv` function from `pandas` (aliased as `pd`),
and wrote `pd.read_csv`. The dot means that the thing on the left (`pd`, i.e., the `pandas` package) *provides* the
thing on the right (the `read_csv` function). In the case of `can_lang.loc[]`, the thing on the left (the `can_lang` data frame)
-*provides* the thing on the right (the `loc[]` operation). In Python,
-both packages (like `pandas`) *and* objects (like our `can_lang` data frame) can provide functions
+*provides* the thing on the right (the `loc[]` operation). In Python,
+both packages (like `pandas`) *and* objects (like our `can_lang` data frame) can provide functions
and other objects that we access using the dot syntax.
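+
+To make the pattern concrete, here is a small side-by-side sketch (for
+illustration only; it repeats steps we have already run):
+
+```{code-cell} ipython3
+# package.function: the pandas package (pd) provides the read_csv function
+pd.read_csv("data/can_lang.csv")
+
+# object.method: the can_lang data frame provides the loc[] operation
+can_lang.loc[can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"]]
+```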
At this point, if we have done everything correctly, `aboriginal_lang` should be a data frame
@@ -585,7 +583,7 @@ aboriginal_lang
```
We can see the original `can_lang` data set contained 214 rows
with multiple kinds of `category`. The data frame
-`aboriginal_lang` contains only 67 rows, and looks like it only contains Aboriginal languages.
+`aboriginal_lang` contains only 67 rows, and appears to include only Aboriginal languages.
So it looks like the `loc[]` operation gave us the result we wanted!
### Using `sort_values` to order and `head` to select rows by value
@@ -598,7 +596,7 @@ with only the Aboriginal languages in the data set and their associated counts.
However, we want to know the **ten** languages that are spoken most often. As a
next step, we will order the `mother_tongue` column from largest to smallest
value and then extract only the top ten rows. This is where the `sort_values`
-and `head` functions come to the rescue!
+and `head` functions come to the rescue!
The `sort_values` function allows us to order the rows of a data frame by the
values of a particular column. We need to specify the column name
@@ -609,7 +607,13 @@ language, we will use the `sort_values` function to order the rows in our
arrange the rows in descending order (from largest to smallest),
so we specify the argument `ascending` as `False`.
-**(FIGURE 1.5 FROM R BOOK MISSING HERE)**
+```{figure} img/sort_values.png
+---
+height: 220px
+name: img-sort-values
+---
+Syntax for using `sort_values` to arrange rows in descending order.
+```
```{code-cell} ipython3
arranged_lang = aboriginal_lang.sort_values(by='mother_tongue', ascending=False)
@@ -619,7 +623,7 @@ arranged_lang
Next, we will obtain the ten most common Aboriginal languages by selecting only
the first ten rows of the `arranged_lang` data frame.
We do this using the `head` function, and specifying the argument
-`10`.
+`10`.
```{code-cell} ipython3
@@ -627,16 +631,134 @@ ten_lang = arranged_lang.head(10)
ten_lang
```
-We have now answered our initial question by generating this table!
+## Combining analysis steps with chaining and multiline expressions
+
+```{index} chaining methods
+```
+
+It took us 3 steps to find the ten Aboriginal languages most often reported in
+2016 as mother tongues in Canada. Starting from the `can_lang` data frame, we:
+
+1) used `loc` to filter the rows so that only the
+ `Aboriginal languages` category remained, and selected the
+ `language` and `mother_tongue` columns,
+2) used `sort_values` to sort the rows by `mother_tongue` in descending order, and
+3) obtained only the top 10 values using `head`.
+
+One way of performing these steps is to just write
+multiple lines of code, storing temporary, intermediate objects as you go.
+```{code-cell} ipython3
+aboriginal_lang = can_lang.loc[can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"]]
+arranged_lang_sorted = aboriginal_lang.sort_values(by='mother_tongue', ascending=False)
+ten_lang = arranged_lang_sorted.head(10)
+```
+
+```{index} multi-line expression
+```
+
+You might find that code hard to read. You're not wrong; it is!
+There are two main issues with readability here. First, each line of code is quite long.
+It is hard to keep track of what methods are being called, and what arguments were used.
+Second, each line introduces a new temporary object. In this case, both `aboriginal_lang` and `arranged_lang_sorted`
+are just temporary results on the way to producing the `ten_lang` data frame.
+This makes the code hard to read, as one has to trace where each temporary object
+goes, and hard to understand, since introducing many named objects also suggests that they
+are of some importance, when really they are just intermediates.
+The need to call multiple methods in a sequence to process a data frame is
+quite common, so this is an important issue to address!
+
+To solve the first problem, we can actually split the long expressions above across
+multiple lines. Although in most cases, a single expression in Python must be contained
+in a single line of code, there are a small number of situations where Python lets us do this.
+Let's rewrite this code in a more readable format using multiline expressions.
+
+```{code-cell} ipython3
+aboriginal_lang = can_lang.loc[
+ can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"]
+]
+arranged_lang_sorted = aboriginal_lang.sort_values(
+ by='mother_tongue', ascending=False
+)
+ten_lang = arranged_lang_sorted.head(10)
+```
+
+This code is the same as the code we showed earlier; you can see the same
+sequence of methods and arguments is used. But long expressions are split
+across multiple lines when they would otherwise get long and unwieldy,
+improving the readability of the code.
+How does Python know when to keep
+reading on the next line for a single expression?
+For the line starting with `aboriginal_lang = ...`, Python sees that the line ends with a left
+bracket symbol `[`, and knows that our
+expression cannot end until we close it with an appropriate corresponding right bracket symbol `]`.
+We put the same two arguments as we did before, and then
+the corresponding right bracket appears after `["language", "mother_tongue"]`.
+For the line starting with `arranged_lang_sorted = ...`, Python sees that the line ends with a left parenthesis symbol `(`,
+and knows the expression cannot end until we close it with the corresponding right parenthesis symbol `)`.
+Again we use the same two arguments as before, and then the
+corresponding right parenthesis appears right after `ascending=False`.
+In both cases, Python keeps reading the next line to figure out
+what the rest of the expression is. We could, of course,
+put all of the code on one line of code, but splitting it across
+multiple lines helps a lot with code readability.
+
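+The same rule applies to any bracketed or parenthesized expression, not just
+data frame code; here is a toy illustration (not part of our analysis):
+
+```{code-cell} ipython3
+# Python keeps reading until the opening parenthesis is closed
+my_sum = (
+    1 + 2 +
+    3
+)
+my_sum
+```
+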
+We still have to handle the issue that each line of code---i.e., each step in the analysis---introduces
+a new temporary object. To address this issue, we can *chain* multiple operations together without
+assigning intermediate objects. The key idea of chaining is that the *output* of
+each step in the analysis is a data frame, which means that you can just directly keep calling methods
+that operate on the output of each step in a sequence! This simplifies the code and makes it
+easier to read. The code below demonstrates the use of both multiline expressions and chaining together.
+The code is now much cleaner, and the `ten_lang` data frame that we get is equivalent to the one
+from the messy code above!
+
+```{code-cell} ipython3
+# obtain the 10 most common Aboriginal languages
+ten_lang = (
+ can_lang.loc[
+ can_lang["category"] == "Aboriginal languages",
+ ["language", "mother_tongue"]
+ ]
+ .sort_values(by="mother_tongue", ascending=False)
+ .head(10)
+)
+ten_lang
+```
+
+Let's parse this new block of code piece by piece.
+The code above starts with a left parenthesis, `(`, and so Python
+knows to keep reading to subsequent lines until it finds the corresponding
+right parenthesis symbol `)`. The `loc` method performs the filtering and selecting steps as before. The line after this
+starts with a period (`.`) that "chains" the output of the `loc` step with the next operation,
+`sort_values`. Since the output of `loc` is a data frame, we can use the `sort_values` method on it
+without first giving it a name! That is what `.sort_values` does on the next line.
+We then once again "chain" together the output of `sort_values` with `head` to ask for the 10
+most common languages. Finally, the right parenthesis `)` corresponding to the very first left parenthesis
+appears on the second last line, completing the multiline expression.
+Instead of creating intermediate objects, with chaining, we take the output of
+one operation and use that to perform the next operation. In doing so, we remove the need to create and
+store intermediates. This can help with readability by simplifying the code.
+
+Now that we've shown you chaining as an alternative to storing
+temporary objects and composing code, does this mean you should *never* store
+temporary objects or compose code? Not necessarily!
+There are times when temporary objects are handy to keep around.
+For example, you might store a temporary object before feeding it into a plot function
+so you can iteratively change the plot without having to
+redo all of your data transformations.
+Chaining many functions can be overwhelming and difficult to debug;
+you may want to store a temporary object midway through to inspect your result
+before moving on with further steps.
+
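+For example, a hypothetical version of our analysis that pauses midway might
+store and inspect the filtered data frame before sorting:
+
+```{code-cell} ipython3
+# store the filtered/selected rows so we can inspect them before continuing
+aboriginal_lang = can_lang.loc[
+    can_lang["category"] == "Aboriginal languages",
+    ["language", "mother_tongue"]
+]
+aboriginal_lang.head(3)  # peek at the intermediate result
+```
+
+Once the intermediate result looks right, we can chain the remaining
+`sort_values` and `head` steps onto `aboriginal_lang` as before.
+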
+We have now answered our initial question by generating the `ten_lang` table!
Are we done? Well, not quite; tables are almost never the best way to present
the result of your analysis to your audience. Even the simple table above with
only two columns presents some difficulty: for example, you have to scrutinize
-the table quite closely to get a sense for the relative numbers of speakers of
-each language. When you move on to more complicated analyses, this issue only
-gets worse. In contrast, a *visualization* would convey this information in a much
-more easily understood format.
+the table quite closely to get a sense for the relative numbers of speakers of
+each language. When you move on to more complicated analyses, this issue only
+gets worse. In contrast, a *visualization* would convey this information in a much
+more easily understood format.
Visualizations are a great tool for summarizing information to help you
-effectively communicate with your audience.
+effectively communicate with your audience.
## Exploring data with visualizations
@@ -644,7 +766,7 @@ effectively communicate with your audience.
```
Creating effective data visualizations is an essential component of any data
-analysis. In this section we will develop a visualization of the
+analysis. In this section we will develop a visualization of the
ten Aboriginal languages that were most often reported in 2016 as mother tongues in
Canada, as well as the number of people that speak each of them.
@@ -670,9 +792,9 @@ formally introduce tidy data in the {ref}`wrangling` chapter.
We will make a bar plot to visualize our data. A bar plot is a chart where the
lengths of the bars represent certain values, like counts or proportions. We
will make a bar plot using the `mother_tongue` and `language` columns from our
-`ten_lang` data frame. To create a bar plot of these two variables using the
+`ten_lang` data frame. To create a bar plot of these two variables using the
`altair` package, we must specify the data frame, which variables
-to put on the x and y axes, and what kind of plot to create.
+to put on the x and y axes, and what kind of plot to create.
First, we need to import the `altair` package.
```{code-cell} ipython3
@@ -683,16 +805,22 @@ import altair as alt
+++
The fundamental object in `altair` is the `Chart`, which takes a data frame as a single argument: `alt.Chart(ten_lang)`.
-With a chart object in hand, we can now specify how we would like the data to be visualized.
-We first indicate what kind of geometric mark we want to use to represent the data. Here we set the mark attribute
+With a chart object in hand, we can now specify how we would like the data to be visualized.
+We first indicate what kind of geometric mark we want to use to represent the data. Here we set the mark attribute
of the chart object using the `Chart.mark_bar` function, because we want to create a bar chart.
-Next, we need to encode the variables of the data frame using
-the `x` (represents the x-axis position of the points) and
+Next, we need to encode the variables of the data frame using
+the `x` (represents the x-axis position of the points) and
`y` (represents the y-axis position of the points) *channels*. We use the `encode()`
function to handle this: we specify that the `language` column should correspond to the x-axis,
and that the `mother_tongue` column should correspond to the y-axis.
-**(FIGURE 1.6 FROM R BOOK IS MISSING)**
+```{figure} img/altair_syntax.png
+---
+height: 220px
+name: img-altair
+---
+Syntax for using `altair` to make a bar chart.
+```
+++
@@ -700,12 +828,9 @@ and that the `mother_tongue` column should correspond to the y-axis.
:tags: []
barplot_mother_tongue = (
- alt.Chart(ten_lang)
- .mark_bar().encode(
- x="language",
- y="mother_tongue"
- ))
-
+ alt.Chart(ten_lang).mark_bar().encode(x="language", y="mother_tongue")
+)
+
```
@@ -728,20 +853,6 @@ Bar plot of the ten Aboriginal languages most often reported by Canadian residen
```{index} see: .; chaining methods
```
-```{index} multi-line expression
-```
-
-> **Note:** The vast majority of the
-> time, a single expression in Python must be contained in a single line of code.
-> However, there *are* a small number of situations in which you can have a
-> single Python expression span multiple lines. Above is one such case: here, Python sees that we put a left
-> parenthesis symbol `(` on the first line right after the assignment symbol `=`, and knows that our
-> expression cannot end until we close it with an appropriate corresponding right parenthesis symbol `)`.
-> So Python keeps reading the next line to figure out
-> what the rest of the expression is. We could, of course,
-> put all of the code on one line of code, but splitting it across
-> multiple lines helps a lot with code readability.
-
### Formatting `altair` objects
It is exciting that we can already visualize our data to help answer our
@@ -760,8 +871,8 @@ Canadian Residents)" would be much more informative.
```
Adding additional labels to our visualizations that we create in `altair` is
-one common and easy way to improve and refine our data visualizations. We can add titles for the axes
-in the `altair` objects using `alt.X` and `alt.Y` with the `title` argument to make
+one common and easy way to improve and refine our data visualizations. We can add titles for the axes
+in the `altair` objects using `alt.X` and `alt.Y` with the `title` argument to make
the axes titles more informative.
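For example, a sketch of adding titles via `alt.X` and `alt.Y` (this mirrors the
chapter's own cell, up to minor details):

```python
barplot_mother_tongue = (
    alt.Chart(ten_lang)
    .mark_bar()
    .encode(
        x=alt.X("language", title="Language"),
        y=alt.Y("mother_tongue", title="Mother Tongue (Number of Canadian Residents)"),
    )
)
```
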
Again, since we are specifying
words (e.g. `"Mother Tongue (Number of Canadian Residents)"`) as arguments to
@@ -795,7 +906,7 @@ Bar plot of the ten Aboriginal languages most often reported by Canadian residen
:::
-The result is shown in {numref}`barplot-mother-tongue-labs`.
+The result is shown in {numref}`barplot-mother-tongue-labs`.
This is already quite an improvement! Let's tackle the next major issue with the visualization
in {numref}`barplot-mother-tongue-labs`: the vertical x axis labels, which are
currently making it difficult to read the different language names.
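The fix, sketched below (the chapter's own cell takes the same approach), is to swap
the `x` and `y` channels so that the bars, and the language names, run horizontally:

```python
barplot_mother_tongue_axis = (
    alt.Chart(ten_lang)
    .mark_bar()
    .encode(
        x=alt.X("mother_tongue", title="Mother Tongue (Number of Canadian Residents)"),
        y=alt.Y("language", title="Language"),
    )
)
```
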
@@ -830,14 +941,14 @@ Horizontal bar plot of the ten Aboriginal languages most often reported by Canad
```{index} altair; sort
```
-Another big step forward, as shown in {numref}`barplot-mother-tongue-labs-axis`! There
+Another big step forward, as shown in {numref}`barplot-mother-tongue-labs-axis`! There
are no more serious issues with the visualization. Now comes time to refine
the visualization to make it even more well-suited to answering the question
we asked earlier in this chapter. For example, the visualization could be made more transparent by
organizing the bars according to the number of Canadian residents reporting
each language, rather than in alphabetical order. We can reorder the bars using
the `sort` argument, which orders a variable (here `language`) based on the
-values of the variable(`mother_tongue`) on the `x-axis`.
+values of the variable (`mother_tongue`) on the `x-axis`.
```{code-cell} ipython3
ordered_barplot_mother_tongue = (
@@ -864,7 +975,7 @@ glue('barplot-mother-tongue-reorder', ordered_barplot_mother_tongue, display=Tru
:name: barplot-mother-tongue-reorder
Bar plot of the ten Aboriginal languages most often reported by Canadian residents as their mother tongue with bars reordered.
-:::
+:::
{numref}`barplot-mother-tongue-reorder` provides a very clear and well-organized
@@ -878,7 +989,7 @@ n.o.s. with over 60,000 Canadian residents reporting it as their mother tongue.
> Cree languages include the following categories: Cree n.o.s., Swampy Cree,
> Plains Cree, Woods Cree, and a 'Cree not included elsewhere' category (which
> includes Moose Cree, Northern East Cree and Southern East Cree)
-> {cite:p}`language2016`.
+> {cite:p}`language2016`.
### Putting it all together
@@ -890,12 +1001,12 @@ n.o.s. with over 60,000 Canadian residents reporting it as their mother tongue.
In the block of code below, we put everything from this chapter together, with a few
modifications. In particular, we have combined all of our steps into one expression
-split across multiple lines using the left and right parenthesis symbols `(` and `)`.
-We have also provided *comments* next to
+split across multiple lines using the left and right parenthesis symbols `(` and `)`.
+We have also provided *comments* next to
many of the lines of code below using the
-hash symbol `#`. When Python sees a `#` sign, it
+hash symbol `#`. When Python sees a `#` sign, it
will ignore all of the text that
-comes after the symbol on that line. So you can use comments to explain lines
+comes after the symbol on that line. So you can use comments to explain lines
of code for others, and perhaps more importantly, your future self!
It's good practice to get in the habit of
commenting your code to improve its readability.
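For example (a tiny illustration, not part of the chapter's analysis):

```python
# This entire line is a comment; Python ignores it.
speakers = 10  # everything after the hash on this line is ignored too
```
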
@@ -905,7 +1016,7 @@ performed an entire data science workflow with a highly effective data
visualization! We asked a question, loaded the data into Python, wrangled the data
(using `[]`, `loc[]`, `sort_values`, and `head`) and created a data visualization to
help answer our question. In this chapter, you got a quick taste of the data
-science workflow; continue on with the next few chapters to learn each of
+science workflow; continue on with the next few chapters to learn each of
these steps in much more detail!
```{code-cell} ipython3
@@ -956,16 +1067,16 @@ Bar plot of the ten Aboriginal languages most often reported by Canadian residen
```{index} see: __doc__; documentation
```
-There are many Python functions in the `pandas` package (and beyond!), and
+There are many Python functions in the `pandas` package (and beyond!), and
nobody can be expected to remember what every one of them does
-or all of the arguments we have to give them. Fortunately, Python provides
-the `help` function, which
-provides an easy way to pull up the documentation for
-most functions quickly. To use the `help` function to access the documentation, you
+or all of the arguments we have to give them. Fortunately, Python provides
+the `help` function, which
+provides an easy way to pull up the documentation for
+most functions quickly. To use the `help` function to access the documentation, you
just put the name of the function you are curious about as an argument inside the `help` function.
For example, if you had forgotten what the `pd.read_csv` function
did or exactly what arguments to pass in, you could run the following
-code:
+code:
```{code-cell} ipython3
:tags: ["remove-output"]
@@ -973,11 +1084,11 @@ help(pd.read_csv)
```
{numref}`help_read_csv` shows the documentation that will pop up,
-including a high-level description of the function, its arguments,
+including a high-level description of the function, its arguments,
a description of each, and more. Note that you may find some of the
text in the documentation a bit too technical right now.
Fear not: as you work through this book, many of these terms will be introduced
-to you, and slowly but surely you will become more adept at understanding and navigating
+to you, and slowly but surely you will become more adept at understanding and navigating
documentation like that shown in {numref}`help_read_csv`. But do keep in mind that the documentation
is not written to *teach* you about a function; it is just there as a reference to *remind*
you about the different arguments and usage of functions that you have already learned about elsewhere.
@@ -994,14 +1105,55 @@ The documentation for the read_csv function including a high-level description,
+++
-If you are working in a Jupyter Lab environment, there are also two more convenient
-ways to access documentation for functions. **JOEL ADD TEXT AND IMAGES HERE**.
+If you are working in a Jupyter Lab environment, there are some conveniences that will help you look up function names
+and access the documentation.
+You can type the first few characters of the function you want to use,
+and then press Tab to bring up a small menu
+that shows you all the available functions
+that start with those characters.
+This is helpful both for remembering function names
+and for preventing typos.
+
++++
+
+```{figure} img/completion_menu.png
+---
+height: 400px
+name: completion_menu
+---
+The suggestions that are shown after typing `pd.read` and pressing Tab.
+```
+
++++
+
+To get more information on the function you want to use,
+you can type out its full name
+and then hold Shift while pressing Tab
+to bring up a help dialog that includes the same information as when using `help()`.
+
++++
+
+```{figure} img/help_dialog.png
+---
+height: 400px
+name: help_dialog
+---
+The help dialog that is shown after typing `pd.read_csv` and then pressing Shift + Tab.
+```
+
++++
+Finally,
+it can be helpful to have this help dialog open at all times,
+especially when you start out learning about programming and data science.
+You can achieve this by clicking on the `Help` text
+in the menu bar at the top
+and then selecting `Show Contextual Help`.
## Exercises
-Practice exercises for the material covered in this chapter
-can be found in the accompanying
+Practice exercises for the material covered in this chapter
+can be found in the accompanying
[worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme)
in the "Python and Pandas" row.
You can launch an interactive version of the worksheet in your browser by clicking the "launch binder" button.
diff --git a/source/preface-text.md b/source/preface-text.md
index 75ae6344..139fe55c 100644
--- a/source/preface-text.md
+++ b/source/preface-text.md
@@ -13,11 +13,16 @@ kernelspec:
name: python3
---
-# Preface -- TBD
+# Preface
+
+```{index} data science, auditable, reproducible
+```
+
+
This textbook aims to be an approachable introduction to the world of data science.
-In this book, we define **data science** \index{data science!definition} as the process of generating
-insight from data through **reproducible** \index{reproducible} and **auditable** \index{auditable} processes.
+In this book, we define **data science** as the process of generating
+insight from data through **reproducible** and **auditable** processes.
If you analyze some data and give your analysis to a friend or colleague, they should
be able to re-run the analysis from start to finish and get the same result you did (*reproducibility*).
They should also be able to see and understand all the steps in the analysis, as well as the history of how
@@ -29,19 +34,17 @@ At a high level, in this book, you will learn how to
(1) identify common problems in data science, and
(2) solve those problems with reproducible and auditable workflows.
-Figure \@ref(fig:img-chapter-overview) summarizes what you will learn in each chapter
-of this book.
-Throughout, you will learn how to use the R programming language [@Rlanguage] to perform
+{numref}`preface-overview-fig` summarizes what you will learn in each chapter
+of this book. Throughout, you will learn how to use the [Python programming language](https://www.python.org/) to perform
all the tasks associated with data analysis. You will
-spend the first four chapters learning how to use R to load, clean, wrangle
+spend the first four chapters learning how to use Python to load, clean, wrangle
(i.e., restructure the data into a usable format) and visualize data
while answering descriptive and exploratory data analysis questions. In the next
six chapters, you will learn how to answer predictive, exploratory, and inferential
data analysis questions with common methods in data science, including
classification, regression, clustering, and estimation.
In the final chapters
-(\@ref(getting-started-with-jupyter)–\@ref(move-to-your-own-machine)),
-you will learn how to combine R code, formatted text, and images
+you will learn how to combine Python code, formatted text, and images
in a single coherent document with Jupyter, use version control for
collaboration, and install and configure the software needed for data science
on your own computer. If you are reading this book as part of a course that you are
@@ -51,20 +54,26 @@ But if you are reading this independently, you may want to jump to these last th
early before going on to make sure your computer is set up in such a way that you can
try out the example code that we include throughout the book.
-```{r img-chapter-overview, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Where are we going?", out.width="100%", fig.retina = 2, fig.align = "center"}
-knitr::include_graphics("img/chapter_overview.jpeg")
+```{figure} img/chapter_overview.jpeg
+---
+height: 400px
+name: preface-overview-fig
+---
+Where are we going?
```
+
+
Each chapter in the book has an accompanying worksheet that provides exercises
to help you practice the concepts you will learn. We strongly recommend that you
work through the worksheet when you finish reading each chapter
before moving on to the next chapter. All of the worksheets
are available at
-[https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets#readme](https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets#readme);
+[https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme](https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme);
the "Exercises" section at the end of each chapter points you to the right worksheet for that chapter.
For each worksheet, you can either launch an interactive version of the worksheet in your browser by clicking the "launch binder" button,
or preview a non-interactive version of the worksheet by clicking "view worksheet."
If you instead decide to download the worksheet and run it on your own machine,
make sure to follow the instructions for computer setup
-found in Chapter \@ref(move-to-your-own-machine). This will ensure that the automated feedback
+found in the {ref}`move-to-your-own-machine` chapter. This will ensure that the automated feedback
and guidance that the worksheets provide will function as intended.
diff --git a/source/reading.md b/source/reading.md
index 4febd2cd..4182df15 100644
--- a/source/reading.md
+++ b/source/reading.md
@@ -16,7 +16,7 @@ kernelspec:
# Reading in data locally and from the web
-## Overview
+## Overview
```{index} see: loading; reading
```
@@ -46,10 +46,10 @@ By the end of the chapter, readers will be able to do the following:
- **U**niform **R**esource **L**ocator (URL)
- Read data into Python using an absolute path, relative path and a URL.
- Compare and contrast the following functions:
- - `read_csv`
+ - `read_csv`
- `read_excel`
- Match the following `pandas` `read_csv` function arguments to their descriptions:
- - `filepath_or_buffer`
+ - `filepath_or_buffer`
- `sep`
- `names`
- `skiprows`
@@ -76,7 +76,7 @@ This chapter will discuss the different functions we can use to import data
into Python, but before we can talk about *how* we read the data into Python with these
functions, we first need to talk about *where* the data lives. When you load a
data set into Python, you first need to tell Python where those files live. The file
-could live on your computer (*local*) or somewhere on the internet (*remote*).
+could live on your computer (*local*) or somewhere on the internet (*remote*).
The place where the file lives on your computer is called the "path". You can
think of the path as directions to the file. There are two kinds of paths:
@@ -90,7 +90,7 @@ in respect to the computer's filesystem base (or root) folder.
Suppose our computer's filesystem looks like the picture in
{numref}`Filesystem`, and we are working in a
-file titled `worksheet_02.ipynb`. If we want to
+file titled `worksheet_02.ipynb`. If we want to
read the `.csv` file named `happiness_report.csv` into Python, we could do this
using either a relative or an absolute path. We show both choices
below.
@@ -124,24 +124,24 @@ happy_data = pd.read_csv("/home/dsci-100/worksheet_02/data/happiness_report.csv"
+++
-So which one should you use? Generally speaking, to ensure your code can be run
-on a different computer, you should use relative paths. An added bonus is that
-it's also less typing! Generally, you should use relative paths because the file's
-absolute path (the names of
-folders between the computer's root `/` and the file) isn't usually the same
-across different computers. For example, suppose Fatima and Jayden are working on a
-project together on the `happiness_report.csv` data. Fatima's file is stored at
+So which one should you use? Generally speaking, to ensure your code can be run
+on a different computer, you should use relative paths. An added bonus is that
+it's also less typing! Generally, you should use relative paths because the file's
+absolute path (the names of
+folders between the computer's root `/` and the file) isn't usually the same
+across different computers. For example, suppose Fatima and Jayden are working on a
+project together on the `happiness_report.csv` data. Fatima's file is stored at
```
/home/Fatima/project/data/happiness_report.csv
```
-while Jayden's is stored at
+while Jayden's is stored at
```
/home/Jayden/project/data/happiness_report.csv
```
-
+
Even though Fatima and Jayden stored their files in the same place on their
computers (in their home folders), the absolute paths are different due to
their different usernames. If Jayden has code that loads the
@@ -154,10 +154,10 @@ relative paths will work on both!
```
Your file could be stored locally, as we discussed, or it could also be
-somewhere on the internet (remotely). For this purpose we use a
+somewhere on the internet (remotely). For this purpose we use a
*Uniform Resource Locator (URL)*, i.e., a web address that looks something
like https://google.com/. URLs indicate the location of a resource on the internet and
-helps us retrieve that resource.
+help us retrieve that resource.
## Reading tabular data from a plain text file into Python
@@ -168,26 +168,26 @@ helps us retrieve that resource.
```
Now that we have learned about *where* data could be, we will learn about *how*
-to import data into Python using various functions. Specifically, we will learn how
+to import data into Python using various functions. Specifically, we will learn how
to *read* tabular data from a plain text file (a document containing only text)
*into* Python and *write* tabular data to a file *out of* Python. The function we use to do this
depends on the file's format. For example, in the last chapter, we learned about using
the `read_csv` function from `pandas` when reading `.csv` (**c**omma-**s**eparated **v**alues)
files. In that case, the *separator* that divided our columns was a
-comma (`,`). We only learned the case where the data matched the expected defaults
-of the `read_csv` function
-(column names are present, and commas are used as the separator between columns).
-In this section, we will learn how to read
+comma (`,`). We only learned the case where the data matched the expected defaults
+of the `read_csv` function
+(column names are present, and commas are used as the separator between columns).
+In this section, we will learn how to read
files that do not satisfy the default expectations of `read_csv`.
```{index} Canadian languages; canlang data
```
-Before we jump into the cases where the data aren't in the expected default format
+Before we jump into the cases where the data aren't in the expected default format
for `pandas` and `read_csv`, let's revisit the more straightforward
case where the defaults hold, and the only argument we need to give to the function
-is the path to the file, `data/can_lang.csv`. The `can_lang` data set contains
-language data from the 2016 Canadian census.
+is the path to the file, `data/can_lang.csv`. The `can_lang` data set contains
+language data from the 2016 Canadian census.
We put `data/` before the file's
name when we are loading the data set because this data set is located in a
sub-folder, named `data`, relative to where we are running our Python code.
@@ -209,18 +209,19 @@ Non-Official & Non-Aboriginal languages,Amharic,22465,12785,200,33670
```{index} pandas
```
-And here is a review of how we can use `read_csv` to load it into Python. First we
+And here is a review of how we can use `read_csv` to load it into Python. First we
load the `pandas` package to gain access to useful
-functions for reading the data.
+functions for reading the data.
```{code-cell} ipython3
-import pandas as pd
+import pandas as pd
```
Next we use `read_csv` to load the data into Python, and in that call we specify the
relative path to the file.
```{code-cell} ipython3
+:tags: ["output_scroll"]
canlang_data = pd.read_csv("data/can_lang.csv")
canlang_data
```
@@ -269,19 +270,20 @@ ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 6
```{index} read function; skiprows argument
```
-To successfully read data like this into Python, the `skiprows`
-argument can be useful to tell Python
+To successfully read data like this into Python, the `skiprows`
+argument can be useful to tell Python
how many rows to skip before
it should start reading in the data. In the example above, we would set this
value to 3 to read and load the data correctly.
```{code-cell} ipython3
+:tags: ["output_scroll"]
canlang_data = pd.read_csv("data/can_lang_meta-data.csv", skiprows=3)
canlang_data
```
How did we know to skip three rows? We looked at the data! The first three rows
-of the data had information we didn't need to import:
+of the data had information we didn't need to import:
```code
Data source: https://ttimbers.github.io/canlang/
@@ -289,13 +291,13 @@ Data originally published in: Statistics Canada Census of Population 2016.
Reproduced and distributed on an as-is basis with their permission.
```
-The column names began at row 4, so we skipped the first three rows.
+The column names began at row 4, so we skipped the first three rows.
### Using the `sep` argument for different separators
Another common way data is stored is with tabs as the separator. Notice the
data file, `can_lang.tsv`, has tabs in between the columns instead of
-commas.
+commas.
```code
category language mother_tongue most_at_home most_at_work lang_known
@@ -318,26 +320,27 @@ Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670
```{index} tsv, read function; read_tsv
```
-To read in `.tsv` (**t**ab **s**eparated **v**alues) files, we can set the `sep` argument
+To read in `.tsv` (**t**ab **s**eparated **v**alues) files, we can set the `sep` argument
in the `read_csv` function to the *tab character* `\t`.
```{index} escape character
```
-> **Note:** `\t` is an example of an *escaped character*,
+> **Note:** `\t` is an example of an *escaped character*,
> which always starts with a backslash (`\`).
-> Escaped characters are used to represent non-printing characters
+> Escaped characters are used to represent non-printing characters
> (like the tab) or characters with special meanings (such as quotation marks).
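> For instance, printing a string that contains `\t` shows the tab it represents:
>
> ```python
> print("language\tmother_tongue")  # the \t prints as a tab between the two words
> ```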
```{code-cell} ipython3
+:tags: ["output_scroll"]
canlang_data = pd.read_csv("data/can_lang.tsv", sep="\t")
canlang_data
```
Let's compare the data frame here to the resulting data frame in Section
{ref}`readcsv` after using `read_csv`. Notice anything? They look the same; they have
-the same number of columns and rows, and have the same column names!
+the same number of columns and rows, and have the same column names!
So even though we needed to use different
arguments depending on the file format, our resulting data frame
(`canlang_data`) in both cases was the same.
@@ -365,7 +368,7 @@ Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670
```
Data frames in Python need to have column names. Thus if you read in data that
-don't have column names, Python will assign names automatically. In this example,
+don't have column names, Python will assign names automatically. In this example,
Python assigns each column a name of `0, 1, 2, 3, 4, 5`.
To read this data into Python, we specify the first
argument as the path to the file (as done with `read_csv`), and then provide
@@ -374,9 +377,10 @@ and finally set `header = None` to tell `pandas` that the data file does not
contain its own column names.
```{code-cell} ipython3
+:tags: ["output_scroll"]
canlang_data = pd.read_csv(
- "data/can_lang_no_cols.tsv",
- sep = "\t",
+ "data/can_lang_no_cols.tsv",
+ sep = "\t",
header = None
)
canlang_data
@@ -387,10 +391,10 @@ canlang_data
It is best to rename your columns manually in this scenario. The current column names
(`0, 1`, etc.) are problematic for two reasons: first, because they are not very descriptive names, which will make your analysis
-confusing; and second, because your column names should generally be *strings*, but are currently *integers*.
+confusing; and second, because your column names should generally be *strings*, but are currently *integers*.
To rename your columns, you can use the `rename` function
-from the [pandas package](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html#).
-The argument of the `rename` function is `columns`, which takes a mapping between the old column names and the new column names.
+from the [pandas package](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html#).
+The argument of the `rename` function is `columns`, which takes a mapping between the old column names and the new column names.
In this case, we want to rename the old columns (`0, 1, ..., 5`) in the `canlang_data` data frame to more descriptive names.
To specify the mapping, we create a *dictionary*: a Python object that represents
@@ -400,6 +404,7 @@ Below, we create a dictionary called `col_map` that maps the old column names in
names, and then pass it to the `rename` function.
```{code-cell} ipython3
+:tags: ["output_scroll"]
col_map = {
0 : "category",
1 : "language",
@@ -415,10 +420,11 @@ canlang_data_renamed
```{index} read function; names argument
```
-The column names can also be assigned to the data frame immediately upon reading it from the file by passing a
-list of column names to the `names` argument in `read_csv`.
+The column names can also be assigned to the data frame immediately upon reading it from the file by passing a
+list of column names to the `names` argument in `read_csv`.
```{code-cell} ipython3
+:tags: ["output_scroll"]
canlang_data = pd.read_csv(
"data/can_lang_no_cols.tsv",
sep="\t",
@@ -448,6 +454,7 @@ path on our local computer. All other arguments that we use are the same as
when using these functions with a local file on our computer.
```{code-cell} ipython3
+:tags: ["output_scroll"]
url = "https://raw.githubusercontent.com/UBC-DSCI/introduction-to-datascience-python/reading/source/data/can_lang.csv"
canlang_data = pd.read_csv(url)
@@ -497,8 +504,8 @@ t 8f??3wn
?Pd(??J-?E???7?'t(?-GZ?????y???c~N?g[^_r?4
yG?O
?K??G?
-
-
+
+
]TUEe??O??c[???????6q??s??d?m???\???H?^????3} ?rZY? ?:L60?^?????XTP+?|?
X?a??4VT?,D?Jq
```
@@ -509,11 +516,12 @@ X?a??4VT?,D?Jq
This type of file representation allows Excel files to store additional things
that you cannot store in a `.csv` file, such as fonts, text formatting,
graphics, multiple sheets and more. And despite looking odd in a plain text
-editor, we can read Excel spreadsheets into Python using the `pandas` package's `read_excel`
-function developed specifically for this
+editor, we can read Excel spreadsheets into Python using the `pandas` package's `read_excel`
+function developed specifically for this
purpose.
```{code-cell} ipython3
+:tags: ["output_scroll"]
canlang_data = pd.read_excel("data/can_lang.xlsx")
canlang_data
```
@@ -522,13 +530,13 @@ If the `.xlsx` file has multiple sheets, you have to use the `sheet_name` argume
to specify the sheet number or name. This functionality is useful when a single sheet contains
multiple tables (a sad thing that happens to many Excel spreadsheets since this
makes reading in data more difficult). You can also specify cell ranges using the
-`usecols` argument (e.g., `usecols="A:D"` for including columns from `A` to `D`).
+`usecols` argument (e.g., `usecols="A:D"` for including columns from `A` to `D`).
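+For example, a sketch (the sheet name below is hypothetical; use your file's actual
+sheet name or its integer position):
+
+```python
+canlang_data = pd.read_excel(
+    "data/can_lang.xlsx",
+    sheet_name="Sheet1",  # hypothetical sheet name
+    usecols="A:D",        # read only spreadsheet columns A through D
+)
+```
+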
As with plain text files, you should always explore the data file before
importing it into Python. Exploring the data beforehand helps you decide which
arguments you need to load the data into Python successfully. If you do not have
the Excel program on your computer, you can use other programs to preview the
-file. Examples include Google Sheets and Libre Office.
+file. Examples include Google Sheets and LibreOffice.
In {numref}`read_func` we summarize the `read_csv` and `read_excel` functions we covered
in this chapter. We also include the arguments for data separated by
@@ -547,20 +555,20 @@ European countries).
* - Comma (`,`) separated files
- `read_csv`
- just the file path
-* - Tab (`\t`) separated files
+* - Tab (`\t`) separated files
- `read_csv`
- `sep="\t"`
* - Missing header
- `read_csv`
- `header=None`
* - European-style numbers, semicolon (`;`) separators
- - `read_csv`
+ - `read_csv`
- `sep=";"`, `thousands="."`, `decimal=","`
* - Excel files (`.xlsx`)
- `read_excel`
- `sheet_name`, `usecols`
-
-
+
+
```
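As a sketch of the European-style row above (the file name is hypothetical; the
book's data folder does not include such a file):

```python
eu_data = pd.read_csv(
    "data/eu_style_numbers.csv",  # hypothetical file
    sep=";",        # semicolons separate the columns
    thousands=".",  # "." groups thousands, so 1.000 means one thousand
    decimal=",",    # "," marks decimals, so 1,5 means one and a half
)
```
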
## Reading data from a database
@@ -576,7 +584,7 @@ different relational database management systems each have their own advantages
and limitations. Almost all employ SQL (*structured query language*) to obtain
data from the database. But you don't need to know SQL to analyze data from
a database; several packages have been written that allow you to connect to
-relational databases and use the Python programming language
+relational databases and use the Python programming language
to obtain data. In this book, we will give examples of how to do this
using Python with SQLite and PostgreSQL databases.
@@ -588,8 +596,8 @@ using Python with SQLite and PostgreSQL databases.
SQLite is probably the simplest relational database system
that one can use in combination with Python. SQLite databases are self-contained and
usually stored and accessed locally on one computer. Data is usually stored in
-a file with a `.db` extension (or sometimes a `.sqlite` extension).
-Similar to Excel files, these are not plain text files and cannot be read in a plain text editor.
+a file with a `.db` extension (or sometimes a `.sqlite` extension).
+Similar to Excel files, these are not plain text files and cannot be read in a plain text editor.
```{index} database; connect, ibis, ibis; ibis
```
@@ -598,18 +606,18 @@ Similar to Excel files, these are not plain text files and cannot be read in a p
```
The first thing you need to do to read data into Python from a database is to
-connect to the database. For an SQLite database, we will do that using
+connect to the database. For an SQLite database, we will do that using
the `connect` function from the
`sqlite` backend in the
`ibis` package. This command does not read
in the data, but simply tells Python where the database is and opens up a
communication channel that Python can use to send SQL commands to the database.
-> **Note:** There is another database package in python called `sqlalchemy`.
+> **Note:** There is another database package in Python called `sqlalchemy`.
> That package is a bit more mature than `ibis`,
-> so if you want to dig deeper into working with databases in Python, that is a good next
-> package to learn about. We will work with `ibis` in this book, as it
-> provides a more modern and friendlier syntax that is more like `pandas` for data analysis code.
+> so if you want to dig deeper into working with databases in Python, that is a good next
+> package to learn about. We will work with `ibis` in this book, as it
+> provides a more modern and friendlier syntax that is more like `pandas` for data analysis code.
```{code-cell} ipython3
import ibis
@@ -621,7 +629,7 @@ conn = ibis.sqlite.connect("data/can_lang.db")
```
Often relational databases have many tables; thus, in order to retrieve
-data from a database, you need to know the name of the table
+data from a database, you need to know the name of the table
in which the data is stored. You can get the names of
all the tables in the database using the `list_tables`
function:
@@ -636,22 +644,22 @@ tables
The `list_tables` function returned only one name---`"can_lang"`---which tells us
that there is only one table in this database. To reference a table in the
-database (so that we can perform operations like selecting columns and filtering rows), we
+database (so that we can perform operations like selecting columns and filtering rows), we
use the `table` function from the `conn` object. The object returned
by the `table` function allows us to work with data
stored in databases as if they were just regular `pandas` data frames; but secretly, behind
-the scenes, `ibis` will turn your commands into SQL queries!
+the scenes, `ibis` will turn your commands into SQL queries!
```{code-cell} ipython3
canlang_table = conn.table("can_lang")
-canlang_table
+canlang_table
```
```{index} database; count, ibis; count
```
Although it looks like we might have obtained the whole data frame from the database, we didn't!
-It's a *reference*; the data is still stored only in the SQLite database. The `canlang_table` object
+It's a *reference*; the data is still stored only in the SQLite database. The `canlang_table` object
is an `AlchemyTable` (`ibis` is using `sqlalchemy` under the hood!), which, when printed, tells
you which columns are available in the table. But unlike a usual `pandas` data frame,
we do not immediately know how many rows are in the table. In order to find out how many
@@ -665,7 +673,7 @@ canlang_table.count()
```{index} execute, ibis; execute
```
-Wait a second...this isn't the number of rows in the database. In fact, we haven't actually sent our
+Wait a second...this isn't the number of rows in the database. In fact, we haven't actually sent our
SQL query to the database yet! We need to explicitly tell `ibis` when we want to send the query.
The reason for this is that databases are often more efficient at working with (i.e., selecting, filtering,
joining, etc.) large data sets than Python. And typically, the database will not even
@@ -693,23 +701,24 @@ str(canlang_table.count().compile())
The output above shows the SQL code that is sent to the database. When we
write `canlang_table.count().execute()` in Python, in the background, the `execute` function is
translating the Python code into SQL, sending that SQL to the database, and then translating the
-response for us. So `ibis` does all the hard work of translating from Python to SQL and back for us;
-we can just stick with Python!
+response for us. So `ibis` does all the hard work of translating from Python to SQL and back for us;
+we can just stick with Python!
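+In other words, the pattern is to build the query lazily and call `execute` only when
+you want the result:
+
+```python
+canlang_table.count().execute()  # sends the SQL to the database and returns the count
+```
+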
The `ibis` package provides lots of `pandas`-like tools for working with database tables.
-For example, we can look at the first few rows of the table by using the `head` function---and
+For example, we can look at the first few rows of the table by using the `head` function---and
we won't forget to `execute` to see the result!
```{index} database; head, ibis;
```
```{code-cell} ipython3
+:tags: ["output_scroll"]
canlang_table.head(10).execute()
```
You can see that `ibis` actually returned a `pandas` data frame to us after we executed the query,
which is very convenient for working with the data after getting it from the database.
-So now that we have the `canlang_table` table reference for the 2016 Canadian Census data in hand, we
+So now that we have the `canlang_table` table reference for the 2016 Canadian Census data in hand, we
can mostly continue onward as if it were a regular data frame. For example, let's do the same exercise
from Chapter 1: we will obtain only those rows corresponding to Aboriginal languages, and keep only
the `language` and `mother_tongue` columns.
@@ -723,7 +732,7 @@ to obtain only certain rows. Below we filter the data to include only Aboriginal
canlang_table_filtered = canlang_table[canlang_table["category"] == "Aboriginal languages"]
canlang_table_filtered
```
-Above you can see that we have not yet executed this command; `canlang_table_filtered` is just showing
+Above you can see that we have not yet executed this command; `canlang_table_filtered` is just showing
the first part of our query (the part that starts with `Selection[r0]` above).
We didn't call `execute` because we are not ready to bring the data into Python yet.
We can still use the database to do some work to obtain *only* the small amount of data we want to work with locally
@@ -746,7 +755,7 @@ aboriginal_lang_data
`ibis` provides many more functions (not just the `[]` operation)
that you can use to manipulate the data within the database before calling
-`execute` to obtain the data in Python. But `ibis` does not provide *every* function
+`execute` to obtain the data in Python. But `ibis` does not provide *every* function
that we need for analysis; we do eventually need to call `execute`.
For example, `ibis` does not provide the `tail` function to look at the last
rows in a database, even though `pandas` does.
@@ -755,6 +764,7 @@ rows in a database, even though `pandas` does.
```
```{code-cell} ipython3
+:tags: ["output_scroll"]
canlang_table_selected.tail(6)
```
@@ -768,14 +778,14 @@ But be very careful using `execute`: databases are often *very* big,
and reading an entire table into Python might take a long time to run or even possibly
crash your machine. So make sure you select and filter the database table
to reduce the data to a reasonable size before using `execute` to read it into Python!
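A sketch of that pattern, mirroring the steps above:

```python
# risky: this would pull the entire table into memory
# whole_table = canlang_table.execute()

# better: reduce the data inside the database first, then execute
aboriginal = canlang_table[canlang_table["category"] == "Aboriginal languages"]
small_result = aboriginal[["language", "mother_tongue"]].execute()
```
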
-
-### Reading data from a PostgreSQL database
+
+### Reading data from a PostgreSQL database
```{index} database; PostgreSQL
```
PostgreSQL (also called Postgres) is a very popular
-and open-source option for relational database software.
+and open-source option for relational database software.
Unlike SQLite,
PostgreSQL uses a client-server database engine, as it was designed to be used
and accessed on a network. This means that you have to provide more information
@@ -790,13 +800,13 @@ need to include when you call the `connect` function is listed below:
Below we demonstrate how to connect to a version of
the `can_mov_db` database, which contains information about Canadian movies.
-Note that the `host` (`fakeserver.stat.ubc.ca`), `user` (`user0001`), and
-`password` (`abc123`) below are *not real*; you will not actually
+Note that the `host` (`fakeserver.stat.ubc.ca`), `user` (`user0001`), and
+`password` (`abc123`) below are *not real*; you will not actually
be able to connect to a database using this information.
```python
conn = ibis.postgres.connect(
- database = "can_mov_db",
+ database = "can_mov_db",
host = "fakeserver.stat.ubc.ca",
port = 5432,
user = "user0001",
@@ -819,7 +829,7 @@ conn.list_tables()
We see that there are 10 tables in this database. Let's first look at the
`"ratings"` table to find the lowest rating that exists in the `can_mov_db`
-database.
+database.
```python
ratings_table = conn.table("ratings")
@@ -887,18 +897,18 @@ then use `ibis` to translate `pandas`-like
commands (the `[]` operation, `head`, etc.) into SQL queries that the database
understands, and then finally `execute` them. And not all `pandas` commands can currently be translated
via `ibis` into database queries. So you might be wondering: why should we use
-databases at all?
+databases at all?
Databases are beneficial in a large-scale setting:
- They enable storing large data sets across multiple computers with backups.
- They provide mechanisms for ensuring data integrity and validating input.
- They provide security and data access control.
-- They allow multiple users to access data simultaneously
+- They allow multiple users to access data simultaneously
and remotely without conflicts and errors.
- For example, there are billions of Google searches conducted daily in 2021 {cite:p}`googlesearches`.
- Can you imagine if Google stored all of the data
- from those searches in a single `.csv` file!? Chaos would ensue!
+ For example, there are billions of Google searches conducted daily in 2021 {cite:p}`googlesearches`.
+ Can you imagine if Google stored all of the data
+ from those searches in a single `.csv` file!? Chaos would ensue!
## Writing data from Python to a `.csv` file
@@ -910,7 +920,7 @@ that has changed (through selecting columns, filtering rows, etc.)
to a file to share it with others or use it for another step in the analysis.
The most straightforward way to do this is to use the `to_csv` function
from the `pandas` package. The default
-arguments are to use a comma (`,`) as the separator, and to include column names
+arguments are to use a comma (`,`) as the separator, and to include column names
in the first row. We also specify `index = False` to tell `pandas` not to print
row numbers in the `.csv` file. Below we demonstrate creating a new version of the Canadian
languages data set without the "Official languages" category according to the
@@ -921,18 +931,18 @@ no_official_lang_data = canlang_data[canlang_data["category"] != "Official langu
no_official_lang_data.to_csv("data/no_official_languages.csv", index=False)
```
-% ## Obtaining data from the web
-%
+% ## Obtaining data from the web
+%
% > **Note:** This section is not required reading for the remainder of the textbook. It
% > is included for those readers interested in learning a little bit more about
% > how to obtain different types of data from the web.
-%
+%
% ```{index} see: application programming interface; API
% ```
-%
+%
% ```{index} API
% ```
-%
+%
% Data doesn't just magically appear on your computer; you need to get it from
% somewhere. Earlier in the chapter we showed you how to access data stored in a
% plain text, spreadsheet-like format (e.g., comma- or tab-separated) from a web
@@ -946,16 +956,16 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False)
% data they have access to, and *how much* data they can access. Typically, the
% website owner will give you a *token* (a secret string of characters somewhat
% like a password) that you have to provide when accessing the API.
-%
+%
% ```{index} web scraping, CSS, HTML
% ```
-%
+%
% ```{index} see: hypertext markup language; HTML
% ```
-%
+%
% ```{index} see: cascading style sheet; CSS
% ```
-%
+%
% Another interesting thought: websites themselves *are* data! When you type a
% URL into your browser window, your browser asks the *web server* (another
% computer on the internet whose job it is to respond to requests for the
@@ -963,117 +973,117 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False)
% data into something you can see. If the website shows you some information that
% you're interested in, you could *create* a data set for yourself by copying and
% pasting that information into a file. This process of taking information
-% directly from what a website displays is called
+% directly from what a website displays is called
% *web scraping* (or sometimes *screen scraping*). Now, of course, copying and pasting
% information manually is a painstaking and error-prone process, especially when
% there is a lot of information to gather. So instead of asking your browser to
% translate the information that the web server provides into something you can
% see, you can collect that data programmatically—in the form of
-% **h**yper**t**ext **m**arkup **l**anguage
-% (HTML)
-% and **c**ascading **s**tyle **s**heet (CSS) code—and process it
+% **h**yper**t**ext **m**arkup **l**anguage
+% (HTML)
+% and **c**ascading **s**tyle **s**heet (CSS) code—and process it
% to extract useful information. HTML provides the
% basic structure of a site and tells the webpage how to display the content
% (e.g., titles, paragraphs, bullet lists etc.), whereas CSS helps style the
-% content and tells the webpage how the HTML elements should
-% be presented (e.g., colors, layouts, fonts etc.).
-%
+% content and tells the webpage how the HTML elements should
+% be presented (e.g., colors, layouts, fonts etc.).
+%
% This subsection will show you the basics of both web scraping
% with the [`BeautifulSoup` Python package](https://beautiful-soup-4.readthedocs.io/en/latest/) {cite:p}`beautifulsoup`
% and accessing the Twitter API
% using the [`tweepy` Python package](https://github.com/tweepy/tweepy) {cite:p}`tweepy`.
-%
+%
% +++
-%
+%
% ### Web scraping
-%
+%
% #### HTML and CSS selectors
-%
+%
% ```{index} web scraping, HTML; selector, CSS; selector, Craiglist
% ```
-%
+%
% When you enter a URL into your browser, your browser connects to the
% web server at that URL and asks for the *source code* for the website.
-% This is the data that the browser translates
+% This is the data that the browser translates
% into something you can see; so if we
% are going to create our own data by scraping a website, we have to first understand
% what that data looks like! For example, let's say we are interested
% in knowing the average rental price (per square foot) of the most recently
-% available one-bedroom apartments in Vancouver
+% available one-bedroom apartments in Vancouver
% on [Craiglist](https://vancouver.craigslist.org). When we visit the Vancouver Craigslist
-% website and search for one-bedroom apartments,
+% website and search for one-bedroom apartments,
% we should see something similar to {numref}`fig:craigslist-human`.
-%
+%
% +++
-%
+%
% ```{figure} img/craigslist_human.png
% :name: fig:craigslist-human
-%
+%
% Craigslist webpage of advertisements for one-bedroom apartments.
% ```
-%
+%
% +++
-%
+%
% Based on what our browser shows us, it's pretty easy to find the size and price
% for each apartment listed. But we would like to be able to obtain that information
% using Python, without any manual human effort or copying and pasting. We do this by
% examining the *source code* that the web server actually sent our browser to
-% display for us. We show a snippet of it below; the
-% entire source
+% display for us. We show a snippet of it below; the
+% entire source
% is [included with the code for this book](https://github.com/UBC-DSCI/introduction-to-datascience-python/blob/main/source/img/website_source.txt):
-%
+%
% ```html
%
% <span class="result-price">$800</span>
% <span class="housing">
%     1br -
% </span>
% <span class="result-hood"> (13768 108th Avenue)</span>
% <span class="result-tags">
%     <span class="maptag">map</span>
% </span>
% <span class="banish icon icon-trash" role="button">
%     <span class="screen-reader-text">hide this posting</span>
% </span>
% <a href="#" class="restore-link">
%     <span class="restore-narrow-text">restore</span>
%     <span class="restore-wide-text">restore this posting</span>
% </a>
% <span class="result-price">$2285</span>
%
%
%
%
-%
+%
% $2285
%
% ```
-%
+%
% Oof...you can tell that the source code for a web page is not really designed
% for humans to understand easily. However, if you look through it closely, you
% will find that the information we're interested in is hidden among the muck.
% For example, near the top of the snippet
% above you can see a line that looks like
-%
+%
% ```html
% <span class="result-price">$800</span>
% ```
-%
+%
% That is definitely storing the price of a particular apartment. With some more
% investigation, you should be able to find things like the date and time of the
% listing, the address of the listing, and more. So this source code most likely
% contains all the information we are interested in!
-%
+%
% ```{index} HTML; tag
% ```
-%
+%
% Let's dig into that line above a bit more. You can see that
% that bit of code has an *opening tag* (words between `<` and `>`, like
% `<span>`) and a *closing tag* (the same with a slash, like `</span>`). HTML
@@ -1087,86 +1097,86 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False)
% apartment prices, maybe we can look for all the tags with the `"result-price"`
% class, and grab the information between the opening and closing tag. Indeed,
% take a look at another line of the source snippet above:
-%
+%
% ```html
% <span class="result-price">$2285</span>
% ```
-%
+%
% It's yet another price for an apartment listing, and the tags surrounding it
% have the `"result-price"` class. Wonderful! Now that we know what pattern we
% are looking for—a dollar amount between opening and closing tags that have the
-% `"result-price"` class—we should be able to use code to pull out all of the
+% `"result-price"` class—we should be able to use code to pull out all of the
% matching patterns from the source code to obtain our data. This sort of "pattern"
% is known as a *CSS selector* (where CSS stands for **c**ascading **s**tyle **s**heet).
-%
-% The above was a simple example of "finding the pattern to look for"; many
+%
+% The above was a simple example of "finding the pattern to look for"; many
% websites are quite a bit larger and more complex, and so is their website
% source code. Fortunately, there are tools available to make this process
-% easier. For example,
-% [SelectorGadget](https://selectorgadget.com/) is
-% an open-source tool that simplifies identifying the generating
-% and finding of CSS selectors.
+% easier. For example,
+% [SelectorGadget](https://selectorgadget.com/) is
+% an open-source tool that simplifies generating
+% and finding CSS selectors.
% At the end of the chapter in the additional resources section, we include a link to
-% a short video on how to install and use the SelectorGadget tool to
-% obtain CSS selectors for use in web scraping.
-% After installing and enabling the tool, you can click the
-% website element for which you want an appropriate selector. For
+% a short video on how to install and use the SelectorGadget tool to
+% obtain CSS selectors for use in web scraping.
+% After installing and enabling the tool, you can click the
+% website element for which you want an appropriate selector. For
% example, if we click the price of an apartment listing, we
% find that SelectorGadget shows us the selector `.result-price`
% in its toolbar, and highlights all the other apartment
% prices that would be obtained using that selector ({numref}`fig:sg1`).
-%
+%
% ```{figure} img/sg1.png
% :name: fig:sg1
-%
+%
% Using the SelectorGadget on a Craigslist webpage to obtain the CSS selector useful for obtaining apartment prices.
% ```
-%
+%
% If we then click the size of an apartment listing, SelectorGadget shows us
% the `span` selector, and highlights many of the lines on the page; this indicates that the
-% `span` selector is not specific enough to capture only apartment sizes ({numref}`fig:sg3`).
-%
+% `span` selector is not specific enough to capture only apartment sizes ({numref}`fig:sg3`).
+%
% ```{figure} img/sg3.png
% :name: fig:sg3
-%
+%
% Using the SelectorGadget on a Craigslist webpage to obtain a CSS selector useful for obtaining apartment sizes.
% ```
-%
+%
% To narrow the selector, we can click one of the highlighted elements that
-% we *do not* want. For example, we can deselect the "pic/map" links,
+% we *do not* want. For example, we can deselect the "pic/map" links,
% resulting in only the data we want highlighted using the `.housing` selector ({numref}`fig:sg2`).
-%
+%
% ```{figure} img/sg2.png
% :name: fig:sg2
-%
+%
% Using the SelectorGadget on a Craigslist webpage to refine the CSS selector to one that is most useful for obtaining apartment sizes.
% ```
-%
+%
% So to scrape information about the square footage and rental price
% of apartment listings, we need to use
% the two CSS selectors `.housing` and `.result-price`, respectively.
% SelectorGadget returns them to us as a comma-separated list (here
% `.housing , .result-price`), which is exactly the format we need to provide to
% Python if we are using more than one CSS selector.
-%
+%
% **Stop! Are you allowed to scrape that website?**
-%
+%
% ```{index} web scraping; permission
% ```
-%
+%
% +++
-%
+%
% *Before* scraping data from the web, you should always check whether or not
% you are *allowed* to scrape it! There are two documents that are important
% for this: the `robots.txt` file and the Terms of Service
% document. If we take a look at [Craigslist's Terms of Service document](https://www.craigslist.org/about/terms.of.use),
-% we find the following text: *"You agree not to copy/collect CL content
+% we find the following text: *"You agree not to copy/collect CL content
% via robots, spiders, scripts, scrapers, crawlers, or any automated or manual equivalent (e.g., by hand)."*
% So unfortunately, without explicit permission, we are not allowed to scrape the website.
-%
+%
% ```{index} Wikipedia
% ```
-%
+%
% What to do now? Well, we *could* ask the owner of Craigslist for permission to scrape.
% However, we are not likely to get a response, and even if we did they would not likely give us permission.
% The more realistic answer is that we simply cannot scrape Craigslist. If we still want
@@ -1174,122 +1184,122 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False)
% To continue learning how to scrape data from the web, let's instead
% scrape data on the population of Canadian cities from Wikipedia.
% We have checked the [Terms of Service document](https://foundation.wikimedia.org/wiki/Terms_of_Use/en),
-% and it does not mention that web scraping is disallowed.
+% and it does not mention that web scraping is disallowed.
% We will use the SelectorGadget tool to pick elements that we are interested in
-% (city names and population counts) and deselect others to indicate that we are not
+% (city names and population counts) and deselect others to indicate that we are not
% interested in them (province names), as shown in {numref}`fig:sg4`.
-%
+%
% ```{figure} img/selectorgadget-wiki-updated.png
% :name: fig:sg4
-%
+%
% Using the SelectorGadget on a Wikipedia webpage.
% ```
-%
+%
% We include a link to a short video tutorial on this process at the end of the chapter
% in the additional resources section. SelectorGadget provides in its toolbar
% the following list of CSS selectors to use:
-%
+%
% +++
-%
+%
% ```code
-% td:nth-child(8) ,
-% td:nth-child(6) ,
-% td:nth-child(4) ,
+% td:nth-child(8) ,
+% td:nth-child(6) ,
+% td:nth-child(4) ,
% .mw-parser-output div tr+ tr td:nth-child(2)
% ```
-%
+%
% +++
-%
+%
% Now that we have the CSS selectors that describe the properties of the elements
% that we want to target (e.g., has a tag name `price`), we can use them to find
% certain elements in web pages and extract data.
-%
+%
% +++
-%
+%
% **Using `pandas.read_html`**
-%
+%
% +++
-%
+%
% The easiest way to read a table from HTML is to use [`pandas.read_html`](https://pandas.pydata.org/docs/reference/api/pandas.read_html.html). We can see that the Wikipedia page of "Canada" has 18 tables.
-%
+%
% ```{code-cell} ipython3
% :tags: [remove-output]
-%
+%
% canada_wiki = pd.read_html("https://en.wikipedia.org/wiki/Canada")
% len(canada_wiki)
% ```
-%
+%
% ```
% 18
% ```
-%
+%
% +++
-%
+%
% With some inspection, we find that the table that shows the population of the most populated provinces is at index 1.
-%
+%
% ```{code-cell} ipython3
% :tags: [remove-output]
-%
+%
% df = canada_wiki[1]
% df.columns = df.columns.droplevel()
% df
% ```
-%
+%
% ```{code-cell} ipython3
% :tags: [remove-input]
-%
+%
% df = pd.read_csv("data/canada-wiki-read_html.csv", index_col=0)
% df
% ```
-%
+%
% **Using `BeautifulSoup`**
-%
+%
% ```{index} BeautifulSoup, requests
% ```
-%
+%
% Now that we have our CSS selectors we can use the `requests` and `BeautifulSoup` Python packages to scrape our desired data from the website. We start by loading the packages:
-%
+%
% ```{code-cell} ipython3
% import requests
% from bs4 import BeautifulSoup
% ```
-%
+%
% Next, we tell Python what page we want to scrape by providing the webpage's URL in quotations to the function `requests.get` and passing it into the `BeautifulSoup` function for parsing:
-%
+%
% ```{code-cell} ipython3
% wiki = requests.get("https://en.wikipedia.org/wiki/Canada")
% page = BeautifulSoup(wiki.content, "html.parser")
% ```
-%
+%
% The `requests.get` function sends a `GET` request to the specified URL and returns the server's response to the HTTP request (*i.e.* a `requests.Response` object). The `BeautifulSoup` function takes the content of the response and returns the HTML source code itself, which we have
% stored in the `page` variable. Next, we use the `select` method of the page object along with the CSS selectors we obtained from the SelectorGadget tool. Make sure to surround the selectors with quotation marks; `select` expects its
% argument to be a string. It selects *nodes* from the HTML document that
+% argument is a string. It selects *nodes* from the HTML document that
% match the CSS selectors you specified. A *node* is an HTML tag pair (e.g.,
% `
` and `
` which defines the cell of a table) combined with the content
% stored between the tags. For our CSS selector `td:nth-child(6)`, an example
% node that would be selected would be:
-%
+%
% +++
-%
+%
% ```
%
% ```
-%
+%
% +++
-%
+%
% We store the result of the `select` function in the `population_nodes` variable. Note that it returns a list, and we slice the list to only print the first 5 elements.
-%
+%
% ```{code-cell} ipython3
% :tags: [remove-output]
-%
+%
% population_nodes = page.select(
% "td:nth-child(8) , td:nth-child(6) , td:nth-child(4) , .mw-parser-output div td:nth-child(2)"
% )
% population_nodes[:5]
% ```
-%
+%
% ```
% [