diff --git a/Dockerfile b/Dockerfile index 8e57cda2..1a31ee60 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ RUN mamba install --quiet --yes \ 'numpy' \ 'jinja2' \ 'altair_data_server' \ + 'altair_saver' \ 'click' \ 'ibis-framework' \ 'ghp-import' \ diff --git a/build_html.sh b/build_html.sh index a7a3f798..f68c05c9 100755 --- a/build_html.sh +++ b/build_html.sh @@ -1,2 +1,2 @@ chmod -R o+w source/ -docker run --rm -v $(pwd):/home/jovyan ubcdsci/py-intro-to-ds:202212191809333bdc71 /bin/bash -c "jupyter-book build source" +docker run --rm -v $(pwd):/home/jovyan ubcdsci/py-intro-to-ds:20230104230634037f38 /bin/bash -c "jupyter-book build source" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 7e821e45..00000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -jupyter-book -matplotlib -numpy diff --git a/source/_config.yml b/source/_config.yml index 93a176b0..779e497b 100644 --- a/source/_config.yml +++ b/source/_config.yml @@ -1,11 +1,7 @@ -####################################################################################### -# Config file for EOSC211 jupyter book -####################################################################################### # Book settings - -title: DSCĪ™ 100 +title: "Data Science: A First Introduction (Python Edition)" author: UBC -copyright: "2021" # Copyright year to be placed in the footer +copyright: "2022" # Copyright year to be placed in the footer logo: "" # A path to the book logo # Patterns to skip when building the book. Can be glob-style (e.g. "*skip.ipynb") exclude_patterns: [_build, Thumbs.db, .DS_Store, "*.ipynb_checkpoints"] @@ -15,10 +11,10 @@ only_build_toc_files: true ####################################################################################### # Execution settings execute: - execute_notebooks: "cache" # Whether to execute notebooks at build time. 
Must be one of ("auto", "force", "cache", "off") + execute_notebooks: "auto" # Whether to execute notebooks at build time. Must be one of ("auto", "force", "cache", "off") cache: "" # A path to the jupyter cache that will be used to store execution artifacts. Defaults to `_build/.jupyter_cache/` # exclude_patterns: [] # A list of patterns to *skip* in execution (e.g. a notebook that takes a really long time) - timeout: 30 # The maximum time (in seconds) each notebook cell is allowed to run. + timeout: 90 # The maximum time (in seconds) each notebook cell is allowed to run. run_in_temp: false # If `True`, then a temporary directory will be created and used as the command working directory (cwd), # otherwise the notebook's parent directory will be the cwd. @@ -65,19 +61,15 @@ latex: latex_engine: pdflatex # one of 'pdflatex', 'xelatex' (recommended for unicode), 'luatex', 'platex', 'uplatex' use_jupyterbook_latex: true # use sphinx-jupyterbook-latex for pdf builds as default - ####################################################################################### - # Launch button settings launch_buttons: binderhub_url: "" - - repository: - url: https://github.com/phaustin/eosc211_students # The URL to your book's repository - path_to_book: "" # A path to your book's folder, relative to the repository root. - branch: e211_live_main # Which branch of the repository should be used when creating links + url: https://github.com/UBC-DSCI/introduction-to-datascience-python # The URL to your book's repository + path_to_book: "source" # A path to your book's folder, relative to the repository root. 
+ branch: production # Which branch of the repository should be used when creating links ####################################################################################### # Advanced and power-user settings diff --git a/source/_toc.yml b/source/_toc.yml index 61dfc676..58497d23 100644 --- a/source/_toc.yml +++ b/source/_toc.yml @@ -1,15 +1,18 @@ format: jb-book root: index.md -options: - numbered: true parts: -- caption: First draft +- caption: Front Matter chapters: - file: preface-text.md - - file: foreword-text.md + #- file: foreword.md - file: acknowledgements.md + - file: acknowledgements-python.md - file: authors.md - - file: setup.md + - file: editors.md + #- file: setup.md +- caption: Chapters + numbered: 3 + chapters: - file: intro.md - file: reading.md - file: wrangling.md @@ -20,5 +23,7 @@ parts: - file: regression2.md - file: clustering.md - file: inference.md - - file: references.md +- caption: Appendix + chapters: - file: appendixA.md + #- file: references.md diff --git a/source/acknowledgements-python.md b/source/acknowledgements-python.md new file mode 100644 index 00000000..dc687718 --- /dev/null +++ b/source/acknowledgements-python.md @@ -0,0 +1,25 @@ +--- +jupytext: + cell_metadata_filter: -all + formats: py:percent,md:myst,ipynb + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.8 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Acknowledgments for the Python Edition + +We'd like to thank everyone that has contributed to the development of +[*Data Science: A First Introduction (Python Edition)*](https://ubc-dsci.github.io/introduction-to-datascience-python/). +This is an open source Python translation of the original [*Data Science: A First Introduction*](https://datasciencebook.ca); +the original focused on the R programming language. 
Both of these books are +used to teach DSCI 100, a new introductory data science course +at the University of British Columbia (UBC). + +We will finalize this acknowledgements section after the book is complete! diff --git a/source/acknowledgements.md b/source/acknowledgements.md index e0ec1699..82ecc5c7 100644 --- a/source/acknowledgements.md +++ b/source/acknowledgements.md @@ -13,7 +13,7 @@ kernelspec: name: python3 --- -# Acknowledgments -- TBD +# Acknowledgments We'd like to thank everyone that has contributed to the development of [*Data Science: A First Introduction*](https://datasciencebook.ca). diff --git a/source/appendixA.md b/source/appendixA.md index a1e1bcc3..7e57bf72 100644 --- a/source/appendixA.md +++ b/source/appendixA.md @@ -13,9 +13,7 @@ kernelspec: name: python3 --- -# Appendix - -# Downloading files from JupyterHub {#appendixA} +# Downloading files from JupyterHub This section will help you save your work from a JupyterHub web-based platform to your own computer. diff --git a/source/authors.md b/source/authors.md index b6465c76..7e6dc803 100644 --- a/source/authors.md +++ b/source/authors.md @@ -13,7 +13,7 @@ kernelspec: name: python3 --- -# About the authors -- TBD +# About the authors **Tiffany Timbers** is an Assistant Professor of Teaching in the Department of Statistics and Co-Director for the Master of Data Science program (Vancouver diff --git a/source/editors.md b/source/editors.md new file mode 100644 index 00000000..dedb5171 --- /dev/null +++ b/source/editors.md @@ -0,0 +1,51 @@ +--- +jupytext: + cell_metadata_filter: -all + formats: py:percent,md:myst,ipynb + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.13.8 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# About the editors of the Python Edition + +**Trevor Campbell** is an Assistant Professor in the Department of Statistics at +the University of British Columbia. 
His research focuses on automated, scalable +Bayesian inference algorithms, Bayesian nonparametrics, streaming data, and +Bayesian theory. He was previously a postdoctoral associate advised by Tamara +Broderick in the Computer Science and Artificial Intelligence Laboratory +(CSAIL) and Institute for Data, Systems, and Society (IDSS) at MIT, a Ph.D. +candidate under Jonathan How in the Laboratory for Information and Decision +Systems (LIDS) at MIT, and before that he was in the Engineering Science +program at the University of Toronto. + ++++ + +**Lindsey Heagy** is an Assistant Professor in the Department of Earth, Ocean, and Atmospheric +Sciences and director of the Geophysical Inversion Facility at the University of British Columbia. +Her research combines computational methods in numerical simulations, inversions, and machine +learning to answer questions about the subsurface of the Earth. Primary applications include +mineral exploration, carbon sequestration, groundwater and environmental studies. She +completed her BSc at the University of Alberta, her PhD at the University of British Columbia, +and held a postdoctoral research position at the University of California, Berkeley prior to +starting her current position at UBC. + ++++ + +**Joel Ostblom** is an Assistant Professor of Teaching in the Department of +Statistics at the University of British Columbia. +During his PhD, Joel developed a passion for data science and reproducibility +through the development of quantitative image analysis pipelines for studying +stem cell and developmental biology. He has since co-created or led the +development of several courses and workshops at the University of Toronto and +is now an assistant professor of teaching in the statistics department at the +University of British Columbia.
Joel cares deeply about spreading data literacy +and excitement over programmatic data analysis, which is reflected in his +contributions to open source projects and data science learning resources. You +can read more about Joel on his [personal page](https://joelostblom.com/). diff --git a/source/img/altair_syntax.png b/source/img/altair_syntax.png new file mode 100644 index 00000000..55676cdb Binary files /dev/null and b/source/img/altair_syntax.png differ diff --git a/source/img/code-figures.pptx b/source/img/code-figures.pptx new file mode 100644 index 00000000..e671a57b Binary files /dev/null and b/source/img/code-figures.pptx differ diff --git a/source/img/completion_menu.png b/source/img/completion_menu.png new file mode 100644 index 00000000..1de73d77 Binary files /dev/null and b/source/img/completion_menu.png differ diff --git a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.001.jpeg b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.001.jpeg index fa1e065d..276300cc 100644 Binary files a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.001.jpeg and b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.001.jpeg differ diff --git a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.002.jpeg b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.002.jpeg index 4fcf0966..b29831ee 100644 Binary files a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.002.jpeg and b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.002.jpeg differ diff --git a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.004.jpeg b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.004.jpeg index ae68f0d2..8675de1e 100644 Binary files a/source/img/data_frame_slides_cdn/data_frame_slides_cdn.004.jpeg and b/source/img/data_frame_slides_cdn/data_frame_slides_cdn.004.jpeg differ diff --git a/source/img/faithful_plot.png b/source/img/faithful_plot.png index fa93f603..a0e986de 100644 Binary files a/source/img/faithful_plot.png and 
b/source/img/faithful_plot.png differ diff --git a/source/img/faithful_plot.svg b/source/img/faithful_plot.svg index cf6ae779..21282faf 100644 --- a/source/img/faithful_plot.svg +++ b/source/img/faithful_plot.svgaiting time to next eruption - (minutes) -Eruption time - (minutes) - - +0102030405060708090100Waiting Time (mins)0.00.51.01.52.02.53.03.54.04.55.05.5Eruption Duration (mins) \ No newline at end of file diff --git a/source/img/filter_rows.png b/source/img/filter_rows.png new file mode 100644 index 00000000..5d15ca4f Binary files /dev/null and b/source/img/filter_rows.png differ diff --git a/source/img/filter_rows_and_columns.png b/source/img/filter_rows_and_columns.png new file mode 100644 index 00000000..124a7dc4 Binary files /dev/null and b/source/img/filter_rows_and_columns.png differ diff --git a/source/img/help_dialog.png b/source/img/help_dialog.png new file mode 100644 index 00000000..c2197ab7 Binary files /dev/null and b/source/img/help_dialog.png differ diff --git a/source/img/pivot_functions/pivot_functions.001.jpeg b/source/img/pivot_functions/pivot_functions.001.jpeg index f72151ba..fc5123f3 100644 Binary files a/source/img/pivot_functions/pivot_functions.001.jpeg and b/source/img/pivot_functions/pivot_functions.001.jpeg differ diff --git a/source/img/pivot_functions/pivot_functions.002.jpeg b/source/img/pivot_functions/pivot_functions.002.jpeg index 5e83772e..961c0813 100644 Binary files a/source/img/pivot_functions/pivot_functions.002.jpeg and b/source/img/pivot_functions/pivot_functions.002.jpeg differ diff --git a/source/img/read_csv_function.png b/source/img/read_csv_function.png new file mode 100644 index 00000000..4593eaa9 Binary files /dev/null and b/source/img/read_csv_function.png differ diff --git a/source/img/select_columns.png b/source/img/select_columns.png new file mode 100644 index 00000000..f316180d Binary files /dev/null and b/source/img/select_columns.png differ diff --git a/source/img/sort_values.png 
b/source/img/sort_values.png new file mode 100644 index 00000000..770ce22d Binary files /dev/null and b/source/img/sort_values.png differ diff --git a/source/img/summarize/summarize.001.jpeg b/source/img/summarize/summarize.001.jpeg index 1ffbaa57..7960e61e 100644 Binary files a/source/img/summarize/summarize.001.jpeg and b/source/img/summarize/summarize.001.jpeg differ diff --git a/source/img/summarize/summarize.002.jpeg b/source/img/summarize/summarize.002.jpeg index 5a6dbbd0..97995520 100644 Binary files a/source/img/summarize/summarize.002.jpeg and b/source/img/summarize/summarize.002.jpeg differ diff --git a/source/img/summarize/summarize.003.jpeg b/source/img/summarize/summarize.003.jpeg index a9d50b07..0a97f6be 100644 Binary files a/source/img/summarize/summarize.003.jpeg and b/source/img/summarize/summarize.003.jpeg differ diff --git a/source/img/summarize/summarize.004.jpeg b/source/img/summarize/summarize.004.jpeg index f3553dba..476ad698 100644 Binary files a/source/img/summarize/summarize.004.jpeg and b/source/img/summarize/summarize.004.jpeg differ diff --git a/source/img/summarize/summarize.005.jpeg b/source/img/summarize/summarize.005.jpeg index b2b1b2ca..d1a4f710 100644 Binary files a/source/img/summarize/summarize.005.jpeg and b/source/img/summarize/summarize.005.jpeg differ diff --git a/source/img/wrangling/pandas_dataframe_series-3.png b/source/img/wrangling/pandas_dataframe_series-3.png index a93bf397..6a2eea54 100644 Binary files a/source/img/wrangling/pandas_dataframe_series-3.png and b/source/img/wrangling/pandas_dataframe_series-3.png differ diff --git a/source/img/wrangling/pandas_dataframe_series.png b/source/img/wrangling/pandas_dataframe_series.png index 285a6559..75ffc893 100644 Binary files a/source/img/wrangling/pandas_dataframe_series.png and b/source/img/wrangling/pandas_dataframe_series.png differ diff --git a/source/img/wrangling/pandas_melt_args_labels.png b/source/img/wrangling/pandas_melt_args_labels.png index 
a1f9bd98..a24eb439 100644 Binary files a/source/img/wrangling/pandas_melt_args_labels.png and b/source/img/wrangling/pandas_melt_args_labels.png differ diff --git a/source/img/wrangling/pandas_melt_wide-long.png b/source/img/wrangling/pandas_melt_wide-long.png index 994e32a7..03b30975 100644 Binary files a/source/img/wrangling/pandas_melt_wide-long.png and b/source/img/wrangling/pandas_melt_wide-long.png differ diff --git a/source/img/wrangling/pandas_pivot_args_labels.png b/source/img/wrangling/pandas_pivot_args_labels.png index 7d57644c..0f961aaf 100644 Binary files a/source/img/wrangling/pandas_pivot_args_labels.png and b/source/img/wrangling/pandas_pivot_args_labels.png differ diff --git a/source/img/wrangling/pandas_pivot_long-wide.png b/source/img/wrangling/pandas_pivot_long-wide.png index 994e0510..faff307b 100644 Binary files a/source/img/wrangling/pandas_pivot_long-wide.png and b/source/img/wrangling/pandas_pivot_long-wide.png differ diff --git a/source/index.md b/source/index.md index 304a3606..be402176 100644 --- a/source/index.md +++ b/source/index.md @@ -13,19 +13,21 @@ kernelspec: name: python3 --- -# Welcome -- TBD +# Welcome! -This is the [website](https://datasciencebook.ca/) for *Data Science: A First Introduction*. +This is the [website](https://ubc-dsci.github.io/introduction-to-datascience-python/) for *Data Science: A First Introduction (Python Edition)*. You can read the web version of the book on this site. Click a section in the table of contents on the left side of the page to navigate to it. If you are on a mobile device, -you may need to open the table of contents first by clicking the menu button on +you may need to open the table of contents first by clicking the menu button on the top left of the page. 
-You can purchase a PDF or print copy of the book -on the [CRC Press website](https://www.routledge.com/Data-Science-A-First-Introduction/Timbers-Campbell-Lee/p/book/9780367524685) or on [Amazon](https://www.amazon.com/Data-Science-First-Introduction-Chapman/dp/0367532174/ref=sr_[…]qid=1644637450&sprefix=data+science+timber%2Caps%2C166&sr=8-1). + +For the R version of the textbook, please visit https://datasciencebook.ca. +You can purchase a PDF or print copy of the R version of the book +on the [CRC Press website](https://www.routledge.com/Data-Science-A-First-Introduction/Timbers-Campbell-Lee/p/book/9780367524685) or +on [Amazon](https://www.amazon.com/Data-Science-First-Introduction-Chapman/dp/0367532174/ref=sr_[…]qid=1644637450&sprefix=data+science+timber%2Caps%2C166&sr=8-1). -This work by [Tiffany Timbers](https://www.tiffanytimbers.com/), [Trevor Campbell](https://trevorcampbell.me/), -and [Melissa Lee](https://www.stat.ubc.ca/users/melissa-lee) is licensed under +This work by [Tiffany Timbers](https://www.tiffanytimbers.com/), [Trevor Campbell](https://trevorcampbell.me/), +and [Melissa Lee](https://www.stat.ubc.ca/users/melissa-lee) is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-nc-sa/4.0/). - diff --git a/source/intro.md b/source/intro.md index 9683b4ef..bad9f768 100644 --- a/source/intro.md +++ b/source/intro.md @@ -24,9 +24,9 @@ from myst_nb import glue This chapter provides an introduction to data science and the Python programming language. The goal here is to get your hands dirty right from the start! We will walk through an entire data analysis, -and along the way introduce different types of data analysis question, some fundamental programming +and along the way introduce different types of data analysis question, some fundamental programming concepts in Python, and the basics of loading, cleaning, and visualizing data. 
In the following chapters, we will -dig into each of these steps in much more detail; but for now, let's jump in to see how much we can do +dig into each of these steps in much more detail; but for now, let's jump in to see how much we can do with data science! ## Chapter learning objectives @@ -38,7 +38,8 @@ By the end of the chapter, readers will be able to do the following: - Read tabular data with `read_csv`. - Use `help()` to access help and documentation tools in Python. - Create new variables and objects in Python. -- Create and organize subsets of tabular data using `[]`, `loc[]`, and `sort_values` +- Create and organize subsets of tabular data using `[]`, `loc[]`, and `sort_values`. +- Chain multiple operations in sequence. - Visualize data with an `altair` bar plot. ## Canadian languages data set @@ -47,7 +48,7 @@ By the end of the chapter, readers will be able to do the following: ``` In this chapter, we will walk through a full analysis of a data set relating to -languages spoken at home by Canadian residents. Many Indigenous peoples exist in Canada +languages spoken at home by Canadian residents. Many Indigenous peoples exist in Canada with their own cultures and languages; these languages are often unique to Canada and not spoken anywhere else in the world {cite:p}`statcan2018mothertongue`. Sadly, colonization has led to the loss of many of these languages. For instance, generations of @@ -55,18 +56,18 @@ children were not allowed to speak their mother tongue (the first language an individual learns in childhood) in Canadian residential schools. Colonizers also renamed places they had "discovered" {cite:p}`wilson2018`. Acts such as these have significantly harmed the continuity of Indigenous languages in Canada, and -some languages are considered "endangered" as few people report speaking them. 
-To learn more, please see *Canadian Geographic*'s article, "Mapping Indigenous Languages in -Canada" {cite:p}`walker2017`, -*They Came for the Children: Canada, Aboriginal -peoples, and Residential Schools* {cite:p}`children2012` -and the *Truth and Reconciliation Commission of Canada's* +some languages are considered "endangered" as few people report speaking them. +To learn more, please see *Canadian Geographic*'s article, "Mapping Indigenous Languages in +Canada" {cite:p}`walker2017`, +*They Came for the Children: Canada, Aboriginal +peoples, and Residential Schools* {cite:p}`children2012` +and the *Truth and Reconciliation Commission of Canada's* *Calls to Action* {cite:p}`calls2015`. -The data set we will study in this chapter is taken from -[the `canlang` R data package](https://ttimbers.github.io/canlang/) +The data set we will study in this chapter is taken from +[the `canlang` R data package](https://ttimbers.github.io/canlang/) {cite:p}`timbers2020canlang`, which has -population language data collected during the 2016 Canadian census {cite:p}`cancensus2016`. +population language data collected during the 2016 Canadian census {cite:p}`cancensus2016`. In this data, there are 214 languages recorded, each having six different properties: 1. `category`: Higher-level language category, describing whether the language is an Official Canadian language, an Aboriginal (i.e., Indigenous) language, or a Non-Official and Non-Aboriginal language. @@ -78,15 +79,15 @@ In this data, there are 214 languages recorded, each having six different proper According to the census, more than 60 Aboriginal languages were reported as being spoken in Canada. 
Suppose we want to know which are the most common; -then we might ask the following question, which we wish to answer using our data: +then we might ask the following question, which we wish to answer using our data: *Which ten Aboriginal languages were most often reported in 2016 as mother -tongues in Canada, and how many people speak each of them?* +tongues in Canada, and how many people speak each of them?* ```{index} data science; good practices ``` -> **Note:** Data science cannot be done without +> **Note:** Data science cannot be done without > a deep understanding of the data and > problem domain. In this book, we have simplified the data sets used in our > examples to concentrate on methods and fundamental concepts. But in real @@ -96,15 +97,15 @@ tongues in Canada, and how many people speak each of them?* > about *how* the data were collected, which affects the conclusions you can > draw. If your data are biased, then your results will be biased! -## Asking a question +## Asking a question Every good data analysis begins with a *question*—like the above—that you aim to answer using data. As it turns out, there are actually a number of different *types* of question regarding data: descriptive, exploratory, inferential, predictive, causal, and mechanistic, all of which are defined in {numref}`questions-table`. {cite:p}`leek2015question,peng2015art` -Carefully formulating a question as early as possible in your analysis—and -correctly identifying which type of question it is—will guide your overall approach to +Carefully formulating a question as early as possible in your analysis—and +correctly identifying which type of question it is—will guide your overall approach to the analysis as well as the selection of appropriate tools. ```{index} question; data analysis, descriptive question; definition, exploratory question; definition @@ -138,12 +139,12 @@ the analysis as well as the selection of appropriate tools. 
* - Mechanistic - A question that asks about the underlying mechanism of the observed patterns, trends, or relationships (i.e., how does it happen?) - How does wealth lead to voting for a certain political party in Canadian elections? - + ``` -In this book, you will learn techniques to answer the -first four types of question: descriptive, exploratory, predictive, and inferential; +In this book, you will learn techniques to answer the +first four types of question: descriptive, exploratory, predictive, and inferential; causal and mechanistic questions are beyond the scope of this book. In particular, you will learn how to apply the following analysis tools: @@ -153,25 +154,25 @@ In particular, you will learn how to apply the following analysis tools: ```{index} clustering; overview, estimation; overview ``` -1. **Summarization:** computing and reporting aggregated values pertaining to a data set. +1. **Summarization:** computing and reporting aggregated values pertaining to a data set. Summarization is most often used to answer descriptive questions, and can occasionally help with answering exploratory questions. -For example, you might use summarization to answer the following question: +For example, you might use summarization to answer the following question: *What is the average race time for runners in this data set?* Tools for summarization are covered in detail in the {ref}`reading` and {ref}`wrangling` chapters, but appear regularly throughout the text. -1. **Visualization:** plotting data graphically. +1. **Visualization:** plotting data graphically. Visualization is typically used to answer descriptive and exploratory questions, but plays a critical supporting role in answering all of the types of question in {numref}`questions-table`. 
For example, you might use visualization to answer the following question: -*Is there any relationship between race time and age for runners in this data set?* +*Is there any relationship between race time and age for runners in this data set?* This is covered in detail in the {ref}`viz` chapter, but again appears regularly throughout the book. 3. **Classification:** predicting a class or category for a new observation. Classification is used to answer predictive questions. For example, you might use classification to answer the following question: *Given measurements of a tumor's average cell area and perimeter, is the tumor benign or malignant?* Classification is covered in the {ref}`classification` and {ref}`classification2` chapters. -4. **Regression:** predicting a quantitative value for a new observation. +4. **Regression:** predicting a quantitative value for a new observation. Regression is also used to answer predictive questions. For example, you might use regression to answer the following question: *What will be the race time for a 20-year-old runner who weighs 50kg?* @@ -181,22 +182,22 @@ data set. Clustering is often used to answer exploratory questions. For example, you might use clustering to answer the following question: *What products are commonly bought together on Amazon?* Clustering is covered in the {ref}`clustering` chapter. -6. **Estimation:** taking measurements for a small number of items from a large group - and making a good guess for the average or proportion for the large group. Estimation +6. **Estimation:** taking measurements for a small number of items from a large group + and making a good guess for the average or proportion for the large group. Estimation is used to answer inferential questions. 
For example, you might use estimation to answer the following question: *Given a survey of cellphone ownership of 100 Canadians, what proportion -of the entire Canadian population own Android phones?* +of the entire Canadian population own Android phones?* Estimation is covered in the {ref}`inference` chapter. -Referring to {numref}`questions-table`, our question about +Referring to {numref}`questions-table`, our question about Aboriginal languages is an example of a *descriptive question*: we are summarizing the characteristics of a data set without further interpretation. And referring to the list above, it looks like we should use visualization and perhaps some summarization to answer the question. So in the remainder -of this chapter, we will work towards making a visualization that shows +of this chapter, we will work towards making a visualization that shows us the ten most common Aboriginal languages in Canada and their associated counts, -according to the 2016 census. +according to the 2016 census. ## Loading a tabular data set @@ -204,7 +205,7 @@ according to the 2016 census. ``` A data set is, at its core essence, a structured collection of numbers and characters. -Aside from that, there are really no strict rules; data sets can come in +Aside from that, there are really no strict rules; data sets can come in many different forms! Perhaps the most common form of data set that you will find in the wild, however, is *tabular data*. Think spreadsheets in Microsoft Excel: tabular data are rectangular-shaped and spreadsheet-like, as shown in {numref}`img-spreadsheet-vs-data frame`. In this book, we will focus primarily on tabular data. @@ -216,14 +217,14 @@ Since we are using Python for data analysis in this book, the first step for us load the data into Python. When we load tabular data into Python, it is represented as a *data frame* object. {numref}`img-spreadsheet-vs-data frame` shows that a Python data frame is very similar to a spreadsheet. 
We refer to the rows as **observations**; these are the things that we -collect the data on, e.g., voters, cities, etc. We refer to the columns as +collect the data on, e.g., voters, cities, etc. We refer to the columns as **variables**; these are the characteristics of those observations, e.g., voters' political -affiliations, cities' populations, etc. +affiliations, cities' populations, etc. ```{figure} img/spreadsheet_vs_df.png --- -height: 400px +height: 500px name: img-spreadsheet-vs-data frame --- A spreadsheet versus a data frame in Python @@ -239,7 +240,7 @@ The first kind of data file that we will learn how to load into Python as a data frame is the *comma-separated values* format (`.csv` for short). These files have names ending in `.csv`, and can be opened and saved using common spreadsheet programs like Microsoft Excel and Google Sheets. For example, the -`.csv` file named `can_lang.csv` +`.csv` file named `can_lang.csv` is included with [the code for this book](https://github.com/UBC-DSCI/introduction-to-datascience-python/tree/main/source/data). If we were to open this data in a plain text editor (a program like Notepad that just shows text with no formatting), we would see each row on its own line, and each entry in the table separated by a comma: @@ -264,7 +265,7 @@ To load this data into Python so that we can do things with it (e.g., perform analyses or create data visualizations), we will need to use a *function.* A function is a special word in Python that takes instructions (we call these *arguments*) and does something. The function we will use to load a `.csv` file -into Python is called `read_csv`. In its most basic +into Python is called `read_csv`. In its most basic use-case, `read_csv` expects that the data file: - has column names (or *headers*), @@ -280,14 +281,14 @@ Below you'll see the code used to load the data into Python using the `read_csv` function. 
Note that the `read_csv` function is not included in the base installation of Python, meaning that it is not one of the primary functions ready to use when you install Python. Therefore, you need to load it from somewhere else -before you can use it. The place from which we will load it is called a Python *package*. +before you can use it. The place from which we will load it is called a Python *package*. A Python package is a collection of functions that can be used in addition to the built-in Python package functions once loaded. The `read_csv` function, in -particular, can be made accessible by loading +particular, can be made accessible by loading [the `pandas` Python package](https://pypi.org/project/pandas/) {cite:p}`reback2020pandas,mckinney-proc-scipy-2010` using the `import` command. The `pandas` package contains many -functions that we will use throughout this book to load, clean, wrangle, -and visualize data. +functions that we will use throughout this book to load, clean, wrangle, +and visualize data. +++ @@ -296,25 +297,23 @@ import pandas as pd ``` This command has two parts. The first is `import pandas`, which loads the `pandas` package. -The second is `as pd`, which give the `pandas` package the much shorter *alias* (another name) `pd`. +The second is `as pd`, which gives the `pandas` package the much shorter *alias* (another name) `pd`. We can now use the `read_csv` function by writing `pd.read_csv`, i.e., the package name, then a dot, then the function name. You can see why we gave `pandas` a shorter alias; if we had to type `pandas.` before every function we wanted to use, our code would become much longer and harder to read! -Now that the `pandas` package is loaded, we can use the `read_csv` function by passing +Now that the `pandas` package is loaded, we can use the `read_csv` function by passing it a single argument: the name of the file, `"can_lang.csv"`.
We have to put quotes around file names and other letters and words that we use in our code to distinguish it from the special words (like functions!) that make up the Python programming language. The file's name is the only argument we need to provide because our file satisfies everything else that the `read_csv` function expects in the default use-case. {numref}`img-read-csv` describes how we use the `read_csv` -to read data into Python. - -**(FIGURE 1.2 FROM R BOOK IS NOT MISSING, BUT STILL R VERSION. NEEDS PD.READ_CSV)** +to read data into Python. -```{figure} img/read_csv_function.jpeg +```{figure} img/read_csv_function.png --- -height: 200px +height: 220px name: img-read-csv --- Syntax for the `read_csv` function @@ -323,6 +322,7 @@ Syntax for the `read_csv` function +++ ```{code-cell} ipython3 +:tags: ["output_scroll"] pd.read_csv("data/can_lang.csv") ``` @@ -332,11 +332,11 @@ pd.read_csv("data/can_lang.csv") ## Naming things in Python When we loaded the 2016 Canadian census language data -using `read_csv`, we did not give this data frame a name. -Therefore the data was just printed on the screen, -and we cannot do anything else with it. That isn't very useful. -What would be more useful would be to give a name -to the data frame that `read_csv` outputs, +using `read_csv`, we did not give this data frame a name. +Therefore the data was just printed on the screen, +and we cannot do anything else with it. That isn't very useful. +What would be more useful would be to give a name +to the data frame that `read_csv` outputs, so that we can refer to it later for analysis and visualization. ```{index} see: =; assignment symbol @@ -345,7 +345,7 @@ so that we can refer to it later for analysis and visualization. ```{index} assignment symbol, string ``` -The way to assign a name to a value in Python is via the *assignment symbol* `=`. +The way to assign a name to a value in Python is via the *assignment symbol* `=`. 
On the left side of the assignment symbol you put the name that you want to use, and on the right side of the assignment symbol you put the value that you want the name to refer to. @@ -360,17 +360,17 @@ my_number = 1 + 2 name = "Alice" ``` -Note that when -we name something in Python using the assignment symbol, `=`, -we do not need to surround the name we are creating with quotes. This is +Note that when +we name something in Python using the assignment symbol, `=`, +we do not need to surround the name we are creating with quotes. This is because we are formally telling Python that this special word denotes the value of whatever is on the right-hand side. Only characters and words that act as *values* on the right-hand side of the assignment -symbol—e.g., the file name `"data/can_lang.csv"` that we specified before, or `"Alice"` above—need +symbol—e.g., the file name `"data/can_lang.csv"` that we specified before, or `"Alice"` above—need to be surrounded by quotes. After making the assignment, we can use the special name words we have created in -place of their values. For example, if we want to do something with the value `3` later on, +place of their values. For example, if we want to do something with the value `3` later on, we can just use `my_number` instead. Let's try adding 2 to `my_number`; you will see that Python just interprets this as adding 2 and 3: @@ -397,7 +397,7 @@ SyntaxError: cannot assign to operator ```{index} object; naming convention ``` -There are certain conventions for naming objects in Python. +There are certain conventions for naming objects in Python. When naming an object we suggest using only lowercase letters, numbers and underscores `_` to separate the words in a name. Python is case sensitive, which means that `Letter` and @@ -408,23 +408,24 @@ remember what each name in your code represents. 
We recommend following the **PEP 8** naming conventions outlined in the *[PEP 8](https://peps.python.org/pep-0008/)* {cite:p}`pep8-style-guide`. Let's now use the assignment symbol to give the name `can_lang` to the 2016 Canadian census language data frame that we get from -`read_csv`. +`read_csv`. ```{code-cell} ipython3 can_lang = pd.read_csv("data/can_lang.csv") ``` Wait a minute, nothing happened this time! Where's our data? -Actually, something did happen: the data was loaded in -and now has the name `can_lang` associated with it. -And we can use that name to access the data frame and do things with it. -For example, we can type the name of the data frame to print both the first few rows +Actually, something did happen: the data was loaded in +and now has the name `can_lang` associated with it. +And we can use that name to access the data frame and do things with it. +For example, we can type the name of the data frame to print both the first few rows and the last few rows. The three dots (`...`) indicate that there are additional rows that are not printed. -You will also see that the number of observations (i.e., rows) and -variables (i.e., columns) are printed just underneath the data frame (214 rows and 6 columns in this case). +You will also see that the number of observations (i.e., rows) and +variables (i.e., columns) are printed just underneath the data frame (214 rows and 6 columns in this case). Printing a few rows from data frame like this is a handy way to get a quick sense for what is contained in it. ```{code-cell} ipython3 +:tags: ["output_scroll"] can_lang ``` @@ -435,8 +436,8 @@ can_lang Now that we've loaded our data into Python, we can start wrangling the data to find the ten Aboriginal languages that were most often reported -in 2016 as mother tongues in Canada. In particular, we want to construct -a table with the ten Aboriginal languages that have the largest +in 2016 as mother tongues in Canada. 
In particular, we want to construct +a table with the ten Aboriginal languages that have the largest counts in the `mother_tongue` column. The first step is to extract from our `can_lang` data only those rows that correspond to Aboriginal languages, and then the second step is to keep only the `language` and `mother_tongue` columns. @@ -457,8 +458,8 @@ and then use `loc[]` to do both in our analysis of the Aboriginal languages data Looking at the `can_lang` data above, we see the column `category` contains different high-level categories of languages, which include "Aboriginal languages", "Non-Official & Non-Aboriginal languages" and "Official languages". To answer -our question we want to filter our data set so we restrict our attention -to only those languages in the "Aboriginal languages" category. +our question we want to filter our data set so we restrict our attention +to only those languages in the "Aboriginal languages" category. ```{index} pandas.DataFrame; [], filter, logical statement, logical statement; equivalency operator, string ``` @@ -476,20 +477,18 @@ column---denoted by `can_lang["category"]`---with the value `"Aboriginal languag You will learn about many other kinds of logical statement in the {ref}`wrangling` chapter. Similar to when we loaded the data file and put quotes around the file name, here we need to put quotes around both `"Aboriginal languages"` and `"category"`. Using -quotes tells Python that this is a *string value* (e.g., a column name, or word data) -and not one of the special words that makes up the Python programming language, +quotes tells Python that this is a *string value* (e.g., a column name, or word data) +and not one of the special words that makes up the Python programming language, or one of the names we have given to objects in the code we have already written. > **Note:** In Python, single quotes (`'`) and double quotes (`"`) are generally -> treated the same. 
So we could have written `'Aboriginal languages'` instead +> treated the same. So we could have written `'Aboriginal languages'` instead > of `"Aboriginal languages"` above, or `'category'` instead of `"category"`. > Try both out for yourself! -**(This figure is wrong-- should be for [] operation below)** - -```{figure} img/read_csv_function.jpeg +```{figure} img/filter_rows.png --- -height: 200px +height: 220px name: img-filter --- Syntax for using the `[]` operation to filter rows. @@ -499,6 +498,7 @@ This operation returns a data frame that has all the columns of the input data f but only those rows corresponding to Aboriginal languages that we asked for in the logical statement. ```{code-cell} ipython3 +:tags: ["output_scroll"] can_lang[can_lang["category"] == "Aboriginal languages"] ``` @@ -513,16 +513,14 @@ We can also use the `[]` operation to select columns from a data frame. We again first type the name of the data frame---here, `can_lang`---followed by square brackets. Inside the square brackets, we provide a *list* of column names. In Python, we denote a *list* using square brackets, where -each item is separated by a comma (`,`). So if we are interested in +each item is separated by a comma (`,`). So if we are interested in selecting only the `language` and `mother_tongue` columns from our original `can_lang` data frame, we put the list `["language", "mother_tongue"]` containing those two column names inside the square brackets of the `[]` operation. -**(This figure is wrong-- should be for [] operation below)** - -```{figure} img/read_csv_function.jpeg +```{figure} img/select_columns.png --- -height: 200px +height: 220px name: img-select --- Syntax for using the `[]` operation to select columns. @@ -549,30 +547,30 @@ The syntax is very similar to the `[]` operation we have already covered: we wil essentially combine both our row filtering and column selection steps from before. 
In particular, we first write the name of the data frame---`can_lang` again---then follow that with the `.loc[]` method. Inside the square brackets, -we write our row filtering logical statement, +we write our row filtering logical statement, then a comma, then our list of columns to select. -**(This figure is wrong-- should be for .loc[] operation below)** - -```{figure} img/read_csv_function.jpeg +```{figure} img/filter_rows_and_columns.png --- -height: 200px +height: 220px name: img-loc --- Syntax for using the `loc[]` operation to filter rows and select columns. ``` ```{code-cell} ipython3 -aboriginal_lang = can_lang.loc[can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"]] +aboriginal_lang = can_lang.loc[ + can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"] +] ``` -There is one very important thing to notice in this code example. +There is one very important thing to notice in this code example. The first is that we used the `loc[]` operation on the `can_lang` data frame by writing `can_lang.loc[]`---first the data frame name, then a dot, then `loc[]`. There's that dot again! If you recall, earlier in this chapter we used the `read_csv` function from `pandas` (aliased as `pd`), and wrote `pd.read_csv`. The dot means that the thing on the left (`pd`, i.e., the `pandas` package) *provides* the thing on the right (the `read_csv` function). In the case of `can_lang.loc[]`, the thing on the left (the `can_lang` data frame) -*provides* the thing on the right (the `loc[]` operation). In Python, -both packages (like `pandas`) *and* objects (like our `can_lang` data frame) can provide functions +*provides* the thing on the right (the `loc[]` operation). In Python, +both packages (like `pandas`) *and* objects (like our `can_lang` data frame) can provide functions and other objects that we access using the dot syntax. 
At this point, if we have done everything correctly, `aboriginal_lang` should be a data frame @@ -585,7 +583,7 @@ aboriginal_lang ``` We can see the original `can_lang` data set contained 214 rows with multiple kinds of `category`. The data frame -`aboriginal_lang` contains only 67 rows, and looks like it only contains Aboriginal languages. +`aboriginal_lang` contains only 67 rows, and looks like it only contains Aboriginal languages. So it looks like the `loc[]` operation gave us the result we wanted! ### Using `sort_values` to order and `head` to select rows by value @@ -598,7 +596,7 @@ with only the Aboriginal languages in the data set and their associated counts. However, we want to know the **ten** languages that are spoken most often. As a next step, we will order the `mother_tongue` column from largest to smallest value and then extract only the top ten rows. This is where the `sort_values` -and `head` functions come to the rescue! +and `head` functions come to the rescue! The `sort_values` function allows us to order the rows of a data frame by the values of a particular column. We need to specify the column name @@ -609,7 +607,13 @@ language, we will use the `sort_values` function to order the rows in our arrange the rows in descending order (from largest to smallest), so we specify the argument `ascending` as `False`. -**(FIGURE 1.5 FROM R BOOK MISSING HERE)** +```{figure} img/sort_values.png +--- +height: 220px +name: img-sort-values +--- +Syntax for using `sort_values` to arrange rows in descending order. +``` ```{code-cell} ipython3 arranged_lang = aboriginal_lang.sort_values(by='mother_tongue', ascending=False) @@ -619,7 +623,7 @@ arranged_lang Next, we will obtain the ten most common Aboriginal languages by selecting only the first ten rows of the `arranged_lang` data frame. We do this using the `head` function, and specifying the argument -`10`. +`10`.
```{code-cell} ipython3 @@ -627,16 +631,134 @@ ten_lang = arranged_lang.head(10) ten_lang ``` -We have now answered our initial question by generating this table! +## Combining analysis steps with chaining and multiline expressions + +```{index} chaining methods +``` + +It took us 3 steps to find the ten Aboriginal languages most often reported in +2016 as mother tongues in Canada. Starting from the `can_lang` data frame, we: + +1) used `loc` to filter the rows so that only the + `Aboriginal languages` category remained, and selected the + `language` and `mother_tongue` columns, +2) used `sort_values` to sort the rows by `mother_tongue` in descending order, and +3) obtained only the top 10 values using `head`. + +One way of performing these steps is to just write +multiple lines of code, storing temporary, intermediate objects as you go. +```{code-cell} ipython3 +aboriginal_lang = can_lang.loc[can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"]] +arranged_lang_sorted = aboriginal_lang.sort_values(by='mother_tongue', ascending=False) +ten_lang = arranged_lang_sorted.head(10) +``` + +```{index} multi-line expression +``` + +You might find that code hard to read. You're not wrong; it is! +There are two main issues with readability here. First, each line of code is quite long. +It is hard to keep track of what methods are being called, and what arguments were used. +Second, each line introduces a new temporary object. In this case, both `aboriginal_lang` and `arranged_lang_sorted` +are just temporary results on the way to producing the `ten_lang` data frame. +This makes the code hard to read, as one has to trace where each temporary object +goes, and hard to understand, since introducing many named objects also suggests that they +are of some importance, when really they are just intermediates. +The need to call multiple methods in a sequence to process a data frame is +quite common, so this is an important issue to address! 
+ +To solve the first problem, we can actually split the long expressions above across +multiple lines. Although in most cases, a single expression in Python must be contained +in a single line of code, there are a small number of situations where Python lets us do this. +Let's rewrite this code in a more readable format using multiline expressions. + +```{code-cell} ipython3 +aboriginal_lang = can_lang.loc[ + can_lang["category"] == "Aboriginal languages", ["language", "mother_tongue"] +] +arranged_lang_sorted = aboriginal_lang.sort_values( + by='mother_tongue', ascending=False +) +ten_lang = arranged_lang_sorted.head(10) +``` + +This code is the same as the code we showed earlier; you can see the same +sequence of methods and arguments is used. But long expressions are split +across multiple lines when they would otherwise get long and unwieldy, +improving the readability of the code. +How does Python know when to keep +reading on the next line for a single expression? +For the line starting with `aboriginal_lang = ...`, Python sees that the line ends with a left +bracket symbol `[`, and knows that our +expression cannot end until we close it with an appropriate corresponding right bracket symbol `]`. +We put the same two arguments as we did before, and then +the corresponding right bracket appears after `["language", "mother_tongue"]`. +For the line starting with `arranged_lang_sorted = ...`, Python sees that the line ends with a left parenthesis symbol `(`, +and knows the expression cannot end until we close it with the corresponding right parenthesis symbol `)`. +Again we use the same two arguments as before, and then the +corresponding right parenthesis appears right after `ascending=False`. +In both cases, Python keeps reading the next line to figure out +what the rest of the expression is. We could, of course, +put all of the code on one line of code, but splitting it across +multiple lines helps a lot with code readability.
+ +We still have to handle the issue that each line of code---i.e., each step in the analysis---introduces +a new temporary object. To address this issue, we can *chain* multiple operations together without +assigning intermediate objects. The key idea of chaining is that the *output* of +each step in the analysis is a data frame, which means that you can just directly keep calling methods +that operate on the output of each step in a sequence! This simplifies the code and makes it +easier to read. The code below demonstrates the use of both multiline expressions and chaining together. +The code is now much cleaner, and the `ten_lang` data frame that we get is equivalent to the one +from the messy code above! + +```{code-cell} ipython3 +# obtain the 10 most common Aboriginal languages +ten_lang = ( + can_lang.loc[ + can_lang["category"] == "Aboriginal languages", + ["language", "mother_tongue"] + ] + .sort_values(by="mother_tongue", ascending=False) + .head(10) +) +ten_lang +``` + +Let's parse this new block of code piece by piece. +The code above starts with a left parenthesis, `(`, and so Python +knows to keep reading to subsequent lines until it finds the corresponding +right parenthesis symbol `)`. The `loc` method performs the filtering and selecting steps as before. The line after this +starts with a period (`.`) that "chains" the output of the `loc` step with the next operation, +`sort_values`. Since the output of `loc` is a data frame, we can use the `sort_values` method on it +without first giving it a name! That is what the `.sort_values` does on the next line. +Finally, we once again "chain" together the output of `sort_values` with `head` to ask for the 10 +most common languages. Finally, the right parenthesis `)` corresponding to the very first left parenthesis +appears on the second last line, completing the multiline expression. 
+Instead of creating intermediate objects, with chaining, we take the output of +one operation and use that to perform the next operation. In doing so, we remove the need to create and +store intermediates. This can help with readability by simplifying the code. + +Now that we've shown you chaining as an alternative to storing +temporary objects and composing code, does this mean you should *never* store +temporary objects or compose code? Not necessarily! +There are times when temporary objects are handy to keep around. +For example, you might store a temporary object before feeding it into a plot function +so you can iteratively change the plot without having to +redo all of your data transformations. +Chaining many functions can be overwhelming and difficult to debug; +you may want to store a temporary object midway through to inspect your result +before moving on with further steps. + +We have now answered our initial question by generating the `ten_lang` table! Are we done? Well, not quite; tables are almost never the best way to present the result of your analysis to your audience. Even the simple table above with only two columns presents some difficulty: for example, you have to scrutinize -the table quite closely to get a sense for the relative numbers of speakers of -each language. When you move on to more complicated analyses, this issue only -gets worse. In contrast, a *visualization* would convey this information in a much -more easily understood format. +the table quite closely to get a sense for the relative numbers of speakers of +each language. When you move on to more complicated analyses, this issue only +gets worse. In contrast, a *visualization* would convey this information in a much +more easily understood format. Visualizations are a great tool for summarizing information to help you -effectively communicate with your audience. +effectively communicate with your audience. 
## Exploring data with visualizations @@ -644,7 +766,7 @@ effectively communicate with your audience. ``` Creating effective data visualizations is an essential component of any data -analysis. In this section we will develop a visualization of the +analysis. In this section we will develop a visualization of the ten Aboriginal languages that were most often reported in 2016 as mother tongues in Canada, as well as the number of people that speak each of them. @@ -670,9 +792,9 @@ formally introduce tidy data in the {ref}`wrangling` chapter. We will make a bar plot to visualize our data. A bar plot is a chart where the lengths of the bars represent certain values, like counts or proportions. We will make a bar plot using the `mother_tongue` and `language` columns from our -`ten_lang` data frame. To create a bar plot of these two variables using the +`ten_lang` data frame. To create a bar plot of these two variables using the `altair` package, we must specify the data frame, which variables -to put on the x and y axes, and what kind of plot to create. +to put on the x and y axes, and what kind of plot to create. First, we need to import the `altair` package. ```{code-cell} ipython3 @@ -683,16 +805,22 @@ import altair as alt +++ The fundamental object in `altair` is the `Chart`, which takes a data frame as a single argument: `alt.Chart(ten_lang)`. -With a chart object in hand, we can now specify how we would like the data to be visualized. -We first indicate what kind of geometric mark we want to use to represent the data. Here we set the mark attribute +With a chart object in hand, we can now specify how we would like the data to be visualized. +We first indicate what kind of geometric mark we want to use to represent the data. Here we set the mark attribute of the chart object using the `Chart.mark_bar` function, because we want to create a bar chart. 
-Next, we need to encode the variables of the data frame using -the `x` (represents the x-axis position of the points) and +Next, we need to encode the variables of the data frame using +the `x` (represents the x-axis position of the points) and `y` (represents the y-axis position of the points) *channels*. We use the `encode()` function to handle this: we specify that the `language` column should correspond to the x-axis, and that the `mother_tongue` column should correspond to the y-axis. -**(FIGURE 1.6 FROM R BOOK IS MISSING)** +```{figure} img/altair_syntax.png +--- +height: 220px +name: img-altair +--- +Syntax for using `altair` to make a bar chart. +``` +++ @@ -700,12 +828,9 @@ and that the `mother_tongue` column should correspond to the y-axis. :tags: [] barplot_mother_tongue = ( - alt.Chart(ten_lang) - .mark_bar().encode( - x="language", - y="mother_tongue" - )) - + alt.Chart(ten_lang).mark_bar().encode(x="language", y="mother_tongue") +) + ``` @@ -728,20 +853,6 @@ Bar plot of the ten Aboriginal languages most often reported by Canadian residen ```{index} see: .; chaining methods ``` -```{index} multi-line expression -``` - -> **Note:** The vast majority of the -> time, a single expression in Python must be contained in a single line of code. -> However, there *are* a small number of situations in which you can have a -> single Python expression span multiple lines. Above is one such case: here, Python sees that we put a left -> parenthesis symbol `(` on the first line right after the assignment symbol `=`, and knows that our -> expression cannot end until we close it with an appropriate corresponding right parenthesis symbol `)`. -> So Python keeps reading the next line to figure out -> what the rest of the expression is. We could, of course, -> put all of the code on one line of code, but splitting it across -> multiple lines helps a lot with code readability. 
- ### Formatting `altair` objects It is exciting that we can already visualize our data to help answer our @@ -760,8 +871,8 @@ Canadian Residents)" would be much more informative. ``` Adding additional labels to our visualizations that we create in `altair` is -one common and easy way to improve and refine our data visualizations. We can add titles for the axes -in the `altair` objects using `alt.X` and `alt.Y` with the `title` argument to make +one common and easy way to improve and refine our data visualizations. We can add titles for the axes +in the `altair` objects using `alt.X` and `alt.Y` with the `title` argument to make the axes titles more informative. Again, since we are specifying words (e.g. `"Mother Tongue (Number of Canadian Residents)"`) as arguments to @@ -795,7 +906,7 @@ Bar plot of the ten Aboriginal languages most often reported by Canadian residen ::: -The result is shown in {numref}`barplot-mother-tongue-labs`. +The result is shown in {numref}`barplot-mother-tongue-labs`. This is already quite an improvement! Let's tackle the next major issue with the visualization in {numref}`barplot-mother-tongue-labs`: the vertical x axis labels, which are currently making it difficult to read the different language names. @@ -830,14 +941,14 @@ Horizontal bar plot of the ten Aboriginal languages most often reported by Canad ```{index} altair; sort ``` -Another big step forward, as shown in {numref}`barplot-mother-tongue-labs-axis`! There +Another big step forward, as shown in {numref}`barplot-mother-tongue-labs-axis`! There are no more serious issues with the visualization. Now comes time to refine the visualization to make it even more well-suited to answering the question we asked earlier in this chapter. For example, the visualization could be made more transparent by organizing the bars according to the number of Canadian residents reporting each language, rather than in alphabetical order. 
We can reorder the bars using the `sort` argument, which orders a variable (here `language`) based on the -values of the variable(`mother_tongue`) on the `x-axis`. +values of the variable(`mother_tongue`) on the `x-axis`. ```{code-cell} ipython3 ordered_barplot_mother_tongue = ( @@ -864,7 +975,7 @@ glue('barplot-mother-tongue-reorder', ordered_barplot_mother_tongue, display=Tru :name: barplot-mother-tongue-reorder Bar plot of the ten Aboriginal languages most often reported by Canadian residents as their mother tongue with bars reordered. -::: +::: {numref}`barplot-mother-tongue-reorder` provides a very clear and well-organized @@ -878,7 +989,7 @@ n.o.s. with over 60,000 Canadian residents reporting it as their mother tongue. > Cree languages include the following categories: Cree n.o.s., Swampy Cree, > Plains Cree, Woods Cree, and a 'Cree not included elsewhere' category (which > includes Moose Cree, Northern East Cree and Southern East Cree) -> {cite:p}`language2016`. +> {cite:p}`language2016`. ### Putting it all together @@ -890,12 +1001,12 @@ n.o.s. with over 60,000 Canadian residents reporting it as their mother tongue. In the block of code below, we put everything from this chapter together, with a few modifications. In particular, we have combined all of our steps into one expression -split across multiple lines using the left and right parenthesis symbols `(` and `)`. -We have also provided *comments* next to +split across multiple lines using the left and right parenthesis symbols `(` and `)`. +We have also provided *comments* next to many of the lines of code below using the -hash symbol `#`. When Python sees a `#` sign, it +hash symbol `#`. When Python sees a `#` sign, it will ignore all of the text that -comes after the symbol on that line. So you can use comments to explain lines +comes after the symbol on that line. So you can use comments to explain lines of code for others, and perhaps more importantly, your future self! 
It's good practice to get in the habit of commenting your code to improve its readability. @@ -905,7 +1016,7 @@ performed an entire data science workflow with a highly effective data visualization! We asked a question, loaded the data into Python, wrangled the data (using `[]`, `loc[]`, `sort_values`, and `head`) and created a data visualization to help answer our question. In this chapter, you got a quick taste of the data -science workflow; continue on with the next few chapters to learn each of +science workflow; continue on with the next few chapters to learn each of these steps in much more detail! ```{code-cell} ipython3 @@ -956,16 +1067,16 @@ Bar plot of the ten Aboriginal languages most often reported by Canadian residen ```{index} see: __doc__; documentation ``` -There are many Python functions in the `pandas` package (and beyond!), and +There are many Python functions in the `pandas` package (and beyond!), and nobody can be expected to remember what every one of them does -or all of the arguments we have to give them. Fortunately, Python provides -the `help` function, which -provides an easy way to pull up the documentation for -most functions quickly. To use the `help` function to access the documentation, you +or all of the arguments we have to give them. Fortunately, Python provides +the `help` function, which +provides an easy way to pull up the documentation for +most functions quickly. To use the `help` function to access the documentation, you just put the name of the function you are curious about as an argument inside the `help` function. 
For example, if you had forgotten what the `pd.read_csv` function did or exactly what arguments to pass in, you could run the following -code: +code: ```{code-cell} ipython3 :tags: ["remove-output"] @@ -973,11 +1084,11 @@ help(pd.read_csv) ``` {numref}`help_read_csv` shows the documentation that will pop up, -including a high-level description of the function, its arguments, +including a high-level description of the function, its arguments, a description of each, and more. Note that you may find some of the text in the documentation a bit too technical right now. Fear not: as you work through this book, many of these terms will be introduced -to you, and slowly but surely you will become more adept at understanding and navigating +to you, and slowly but surely you will become more adept at understanding and navigating documentation like that shown in {numref}`help_read_csv`. But do keep in mind that the documentation is not written to *teach* you about a function; it is just there as a reference to *remind* you about the different arguments and usage of functions that you have already learned about elsewhere. @@ -994,14 +1105,55 @@ The documentation for the read_csv function including a high-level description, +++ -If you are working in a Jupyter Lab environment, there are also two more convenient -ways to access documentation for functions. **JOEL ADD TEXT AND IMAGES HERE**. +If you are working in a Jupyter Lab environment, there are some conveniences that will help you look up function names +and access the documentation. +You can type the first characters of the function you want to use, +and then press Tab to bring up a small menu +that shows you all the available functions +that start with those characters. +This is helpful both for remembering function names +and to prevent typos. + ++++ + +```{figure} img/completion_menu.png +--- +height: 400px +name: completion_menu +--- +The suggestions that are shown after typing `pd.read` and pressing Tab.
+``` + ++++ + +To get more info on the function you want to use, +you can type out the full name +and then hold Shift while pressing Tab +to bring up a help dialogue including the same information as when using `help()`. + ++++ + +```{figure} img/help_dialog.png +--- +height: 400px +name: help_dialog +--- +The help dialog that is shown after typing `pd.read_csv` and then pressing Shift + Tab. +``` + ++++ +Finally, +it can be helpful to have this help dialog open at all times, +especially when you start out learning about programming and data science. +You can achieve this by clicking on the `Help` text +in the menu bar at the top +and then selecting `Show Contextual Help`. ## Exercises -Practice exercises for the material covered in this chapter -can be found in the accompanying +Practice exercises for the material covered in this chapter +can be found in the accompanying [worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme) in the "Python and Pandas" row. You can launch an interactive version of the worksheet in your browser by clicking the "launch binder" button. diff --git a/source/preface-text.md b/source/preface-text.md index 75ae6344..139fe55c 100644 --- a/source/preface-text.md +++ b/source/preface-text.md @@ -13,11 +13,16 @@ kernelspec: name: python3 --- -# Preface -- TBD +# Preface + +```{index} data science, auditable, reproducible +``` + + This textbook aims to be an approachable introduction to the world of data science. -In this book, we define **data science** \index{data science!definition} as the process of generating -insight from data through **reproducible** \index{reproducible} and **auditable** \index{auditable} processes. +In this book, we define **data science** as the process of generating +insight from data through **reproducible** and **auditable** processes. 
If you analyze some data and give your analysis to a friend or colleague, they should be able to re-run the analysis from start to finish and get the same result you did (*reproducibility*). They should also be able to see and understand all the steps in the analysis, as well as the history of how @@ -29,19 +34,17 @@ At a high level, in this book, you will learn how to (1) identify common problems in data science, and (2) solve those problems with reproducible and auditable workflows. -Figure \@ref(fig:img-chapter-overview) summarizes what you will learn in each chapter -of this book. -Throughout, you will learn how to use the R programming language [@Rlanguage] to perform +{numref}`preface-overview-fig` summarizes what you will learn in each chapter +of this book. Throughout, you will learn how to use the [Python programming language](https://www.python.org/) to perform all the tasks associated with data analysis. You will -spend the first four chapters learning how to use R to load, clean, wrangle +spend the first four chapters learning how to use Python to load, clean, wrangle (i.e., restructure the data into a usable format) and visualize data while answering descriptive and exploratory data analysis questions. In the next six chapters, you will learn how to answer predictive, exploratory, and inferential data analysis questions with common methods in data science, including classification, regression, clustering, and estimation. In the final chapters -(\@ref(getting-started-with-jupyter)–\@ref(move-to-your-own-machine)), -you will learn how to combine R code, formatted text, and images +you will learn how to combine Python code, formatted text, and images in a single coherent document with Jupyter, use version control for collaboration, and install and configure the software needed for data science on your own computer. 
If you are reading this book as part of a course that you are @@ -51,20 +54,26 @@ But if you are reading this independently, you may want to jump to these last th early before going on to make sure your computer is set up in such a way that you can try out the example code that we include throughout the book. -```{r img-chapter-overview, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Where are we going?", out.width="100%", fig.retina = 2, fig.align = "center"} -knitr::include_graphics("img/chapter_overview.jpeg") +```{figure} img/chapter_overview.jpeg +--- +height: 400px +name: preface-overview-fig +--- +Where are we going? ``` + + Each chapter in the book has an accompanying worksheet that provides exercises to help you practice the concepts you will learn. We strongly recommend that you work through the worksheet when you finish reading each chapter before moving on to the next chapter. All of the worksheets are available at -[https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets#readme](https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets#readme); +[https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme](https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme); the "Exercises" section at the end of each chapter points you to the right worksheet for that chapter. For each worksheet, you can either launch an interactive version of the worksheet in your browser by clicking the "launch binder" button, or preview a non-interactive version of the worksheet by clicking "view worksheet." If you instead decide to download the worksheet and run it on your own machine, make sure to follow the instructions for computer setup -found in Chapter \@ref(move-to-your-own-machine). This will ensure that the automated feedback +found in the {ref}`move-to-your-own-machine` chapter. This will ensure that the automated feedback and guidance that the worksheets provide will function as intended. 
diff --git a/source/reading.md b/source/reading.md index 4febd2cd..4182df15 100644 --- a/source/reading.md +++ b/source/reading.md @@ -16,7 +16,7 @@ kernelspec: # Reading in data locally and from the web -## Overview +## Overview ```{index} see: loading; reading ``` @@ -46,10 +46,10 @@ By the end of the chapter, readers will be able to do the following: - **U**niform **R**esource **L**ocator (URL) - Read data into Python using an absolute path, relative path and a URL. - Compare and contrast the following functions: - - `read_csv` + - `read_csv` - `read_excel` - Match the following `pandas` `read_csv` function arguments to their descriptions: - - `filepath_or_buffer` + - `filepath_or_buffer` - `sep` - `names` - `skiprows` @@ -76,7 +76,7 @@ This chapter will discuss the different functions we can use to import data into Python, but before we can talk about *how* we read the data into Python with these functions, we first need to talk about *where* the data lives. When you load a data set into Python, you first need to tell Python where those files live. The file -could live on your computer (*local*) or somewhere on the internet (*remote*). +could live on your computer (*local*) or somewhere on the internet (*remote*). The place where the file lives on your computer is called the "path". You can think of the path as directions to the file. There are two kinds of paths: @@ -90,7 +90,7 @@ in respect to the computer's filesystem base (or root) folder. Suppose our computer's filesystem looks like the picture in {numref}`Filesystem`, and we are working in a -file titled `worksheet_02.ipynb`. If we want to +file titled `worksheet_02.ipynb`. If we want to read the `.csv` file named `happiness_report.csv` into Python, we could do this using either a relative or an absolute path. We show both choices below. @@ -124,24 +124,24 @@ happy_data = pd.read_csv("/home/dsci-100/worksheet_02/data/happiness_report.csv" +++ -So which one should you use? 
Generally speaking, to ensure your code can be run -on a different computer, you should use relative paths. An added bonus is that -it's also less typing! Generally, you should use relative paths because the file's -absolute path (the names of -folders between the computer's root `/` and the file) isn't usually the same -across different computers. For example, suppose Fatima and Jayden are working on a -project together on the `happiness_report.csv` data. Fatima's file is stored at +So which one should you use? Generally speaking, to ensure your code can be run +on a different computer, you should use relative paths. An added bonus is that +it's also less typing! Generally, you should use relative paths because the file's +absolute path (the names of +folders between the computer's root `/` and the file) isn't usually the same +across different computers. For example, suppose Fatima and Jayden are working on a +project together on the `happiness_report.csv` data. Fatima's file is stored at ``` /home/Fatima/project/data/happiness_report.csv ``` -while Jayden's is stored at +while Jayden's is stored at ``` /home/Jayden/project/data/happiness_report.csv ``` - + Even though Fatima and Jayden stored their files in the same place on their computers (in their home folders), the absolute paths are different due to their different usernames. If Jayden has code that loads the @@ -154,10 +154,10 @@ relative paths will work on both! ``` Your file could be stored locally, as we discussed, or it could also be -somewhere on the internet (remotely). For this purpose we use a +somewhere on the internet (remotely). For this purpose we use a *Uniform Resource Locator (URL)*, i.e., a web address that looks something like https://google.com/. URLs indicate the location of a resource on the internet and -helps us retrieve that resource. +helps us retrieve that resource. ## Reading tabular data from a plain text file into Python @@ -168,26 +168,26 @@ helps us retrieve that resource. 
``` Now that we have learned about *where* data could be, we will learn about *how* -to import data into Python using various functions. Specifically, we will learn how +to import data into Python using various functions. Specifically, we will learn how to *read* tabular data from a plain text file (a document containing only text) *into* Python and *write* tabular data to a file *out of* Python. The function we use to do this depends on the file's format. For example, in the last chapter, we learned about using the `read_csv` function from `pandas` when reading `.csv` (**c**omma-**s**eparated **v**alues) files. In that case, the *separator* that divided our columns was a -comma (`,`). We only learned the case where the data matched the expected defaults -of the `read_csv` function -(column names are present, and commas are used as the separator between columns). -In this section, we will learn how to read +comma (`,`). We only learned the case where the data matched the expected defaults +of the `read_csv` function +(column names are present, and commas are used as the separator between columns). +In this section, we will learn how to read files that do not satisfy the default expectations of `read_csv`. ```{index} Canadian languages; canlang data ``` -Before we jump into the cases where the data aren't in the expected default format +Before we jump into the cases where the data aren't in the expected default format for `pandas` and `read_csv`, let's revisit the more straightforward case where the defaults hold, and the only argument we need to give to the function -is the path to the file, `data/can_lang.csv`. The `can_lang` data set contains -language data from the 2016 Canadian census. +is the path to the file, `data/can_lang.csv`. The `can_lang` data set contains +language data from the 2016 Canadian census. 
We put `data/` before the file's name when we are loading the data set because this data set is located in a sub-folder, named `data`, relative to where we are running our Python code. @@ -209,18 +209,19 @@ Non-Official & Non-Aboriginal languages,Amharic,22465,12785,200,33670 ```{index} pandas ``` -And here is a review of how we can use `read_csv` to load it into Python. First we +And here is a review of how we can use `read_csv` to load it into Python. First we load the `pandas` package to gain access to useful -functions for reading the data. +functions for reading the data. ```{code-cell} ipython3 -import pandas as pd +import pandas as pd ``` Next we use `read_csv` to load the data into Python, and in that call we specify the relative path to the file. ```{code-cell} ipython3 +:tags: ["output_scroll"] canlang_data = pd.read_csv("data/can_lang.csv") canlang_data ``` @@ -269,19 +270,20 @@ ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 6 ```{index} read function; skiprows argument ``` -To successfully read data like this into Python, the `skiprows` -argument can be useful to tell Python +To successfully read data like this into Python, the `skiprows` +argument can be useful to tell Python how many rows to skip before it should start reading in the data. In the example above, we would set this value to 3 to read and load the data correctly. ```{code-cell} ipython3 +:tags: ["output_scroll"] canlang_data = pd.read_csv("data/can_lang_meta-data.csv", skiprows=3) canlang_data ``` How did we know to skip three rows? We looked at the data! The first three rows -of the data had information we didn't need to import: +of the data had information we didn't need to import: ```code Data source: https://ttimbers.github.io/canlang/ @@ -289,13 +291,13 @@ Data originally published in: Statistics Canada Census of Population 2016. Reproduced and distributed on an as-is basis with their permission. 
``` -The column names began at row 4, so we skipped the first three rows. +The column names began at row 4, so we skipped the first three rows. ### Using the `sep` argument for different separators Another common way data is stored is with tabs as the separator. Notice the data file, `can_lang.tsv`, has tabs in between the columns instead of -commas. +commas. ```code category language mother_tongue most_at_home most_at_work lang_known @@ -318,26 +320,27 @@ Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670 ```{index} tsv, read function; read_tsv ``` -To read in `.tsv` (**t**ab **s**eparated **v**alues) files, we can set the `sep` argument +To read in `.tsv` (**t**ab **s**eparated **v**alues) files, we can set the `sep` argument in the `read_csv` function to the *tab character* `\t`. ```{index} escape character ``` -> **Note:** `\t` is an example of an *escaped character*, +> **Note:** `\t` is an example of an *escaped character*, > which always starts with a backslash (`\`). -> Escaped characters are used to represent non-printing characters +> Escaped characters are used to represent non-printing characters > (like the tab) or characters with special meanings (such as quotation marks). ```{code-cell} ipython3 +:tags: ["output_scroll"] canlang_data = pd.read_csv("data/can_lang.tsv", sep="\t") canlang_data ``` Let's compare the data frame here to the resulting data frame in Section {ref}`readcsv` after using `read_csv`. Notice anything? They look the same; they have -the same number of columns and rows, and have the same column names! +the same number of columns and rows, and have the same column names! So even though we needed to use different arguments depending on the file format, our resulting data frame (`canlang_data`) in both cases was the same. @@ -365,7 +368,7 @@ Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670 ``` Data frames in Python need to have column names. 
Thus if you read in data that -don't have column names, Python will assign names automatically. In this example, +don't have column names, Python will assign names automatically. In this example, Python assigns each column a name of `0, 1, 2, 3, 4, 5`. To read this data into Python, we specify the first argument as the path to the file (as done with `read_csv`), and then provide @@ -374,9 +377,10 @@ and finally set `header = None` to tell `pandas` that the data file does not contain its own column names. ```{code-cell} ipython3 +:tags: ["output_scroll"] canlang_data = pd.read_csv( - "data/can_lang_no_cols.tsv", - sep = "\t", + "data/can_lang_no_cols.tsv", + sep = "\t", header = None ) canlang_data @@ -387,10 +391,10 @@ canlang_data It is best to rename your columns manually in this scenario. The current column names (`0, 1`, etc.) are problematic for two reasons: first, because they not very descriptive names, which will make your analysis -confusing; and second, because your column names should generally be *strings*, but are currently *integers*. +confusing; and second, because your column names should generally be *strings*, but are currently *integers*. To rename your columns, you can use the `rename` function -from the [pandas package](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html#). -The argument of the `rename` function is `columns`, which takes a mapping between the old column names and the new column names. +from the [pandas package](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html#). +The argument of the `rename` function is `columns`, which takes a mapping between the old column names and the new column names. In this case, we want to rename the old columns (`0, 1, ..., 5`) in the `canlang_data` data frame to more descriptive names. 
To specify the mapping, we create a *dictionary*: a Python object that represents @@ -400,6 +404,7 @@ Below, we create a dictionary called `col_map` that maps the old column names in names, and then pass it to the `rename` function. ```{code-cell} ipython3 +:tags: ["output_scroll"] col_map = { 0 : "category", 1 : "language", @@ -415,10 +420,11 @@ canlang_data_renamed ```{index} read function; names argument ``` -The column names can also be assigned to the data frame immediately upon reading it from the file by passing a -list of column names to the `names` argument in `read_csv`. +The column names can also be assigned to the data frame immediately upon reading it from the file by passing a +list of column names to the `names` argument in `read_csv`. ```{code-cell} ipython3 +:tags: ["output_scroll"] canlang_data = pd.read_csv( "data/can_lang_no_cols.tsv", sep="\t", @@ -448,6 +454,7 @@ path on our local computer. All other arguments that we use are the same as when using these functions with a local file on our computer. ```{code-cell} ipython3 +:tags: ["output_scroll"] url = "https://raw.githubusercontent.com/UBC-DSCI/introduction-to-datascience-python/reading/source/data/can_lang.csv" pd.read_csv(url) canlang_data = pd.read_csv(url) @@ -497,8 +504,8 @@ t 8f??3wn ?Pd(??J-?E???7?'t(?-GZ?????y???c~N?g[^_r?4 yG?O ?K??G? - - + + ]TUEe??O??c[???????6q??s??d?m???\???H?^????3} ?rZY? ?:L60?^?????XTP+?|? X?a??4VT?,D?Jq ``` @@ -509,11 +516,12 @@ X?a??4VT?,D?Jq This type of file representation allows Excel files to store additional things that you cannot store in a `.csv` file, such as fonts, text formatting, graphics, multiple sheets and more. And despite looking odd in a plain text -editor, we can read Excel spreadsheets into Python using the `pandas` package's `read_excel` -function developed specifically for this +editor, we can read Excel spreadsheets into Python using the `pandas` package's `read_excel` +function developed specifically for this purpose. 
```{code-cell} ipython3 +:tags: ["output_scroll"] canlang_data = pd.read_excel("data/can_lang.xlsx") canlang_data ``` @@ -522,13 +530,13 @@ If the `.xlsx` file has multiple sheets, you have to use the `sheet_name` argume to specify the sheet number or name. This functionality is useful when a single sheet contains multiple tables (a sad thing that happens to many Excel spreadsheets since this makes reading in data more difficult). You can also specify cell ranges using the -`usecols` argument (e.g., `usecols="A:D"` for including columns from `A` to `D`). +`usecols` argument (e.g., `usecols="A:D"` for including columns from `A` to `D`). As with plain text files, you should always explore the data file before importing it into Python. Exploring the data beforehand helps you decide which arguments you need to load the data into Python successfully. If you do not have the Excel program on your computer, you can use other programs to preview the -file. Examples include Google Sheets and Libre Office. +file. Examples include Google Sheets and Libre Office. In {numref}`read_func` we summarize the `read_csv` and `read_excel` functions we covered in this chapter. We also include the arguments for data separated by @@ -547,20 +555,20 @@ European countries). * - Comma (`,`) separated files - `read_csv` - just the file path -* - Tab (`\t`) separated files +* - Tab (`\t`) separated files - `read_csv` - `sep="\t"` * - Missing header - `read_csv` - `header=None` * - European-style numbers, semicolon (`;`) separators - - `read_csv` + - `read_csv` - `sep=";"`, `thousands="."`, `decimal=","` * - Excel files (`.xlsx`) - `read_excel` - `sheet_name`, `usecols` - - + + ``` ## Reading data from a database @@ -576,7 +584,7 @@ different relational database management systems each have their own advantages and limitations. Almost all employ SQL (*structured query language*) to obtain data from the database. 
But you don't need to know SQL to analyze data from a database; several packages have been written that allow you to connect to -relational databases and use the Python programming language +relational databases and use the Python programming language to obtain data. In this book, we will give examples of how to do this using Python with SQLite and PostgreSQL databases. @@ -588,8 +596,8 @@ using Python with SQLite and PostgreSQL databases. SQLite is probably the simplest relational database system that one can use in combination with Python. SQLite databases are self-contained and usually stored and accessed locally on one computer. Data is usually stored in -a file with a `.db` extension (or sometimes a `.sqlite` extension). -Similar to Excel files, these are not plain text files and cannot be read in a plain text editor. +a file with a `.db` extension (or sometimes a `.sqlite` extension). +Similar to Excel files, these are not plain text files and cannot be read in a plain text editor. ```{index} database; connect, ibis, ibis; ibis ``` @@ -598,18 +606,18 @@ Similar to Excel files, these are not plain text files and cannot be read in a p ``` The first thing you need to do to read data into Python from a database is to -connect to the database. For an SQLite database, we will do that using +connect to the database. For an SQLite database, we will do that using the `connect` function from the `sqlite` backend in the `ibis` package. This command does not read in the data, but simply tells Python where the database is and opens up a communication channel that Python can use to send SQL commands to the database. -> **Note:** There is another database package in python called `sqlalchemy`. +> **Note:** There is another database package in Python called `sqlalchemy`. > That package is a bit more mature than `ibis`, -> so if you want to dig deeper into working with databases in Python, that is a good next -> package to learn about.
We will work with `ibis` in this book, as it -> provides a more modern and friendlier syntax that is more like `pandas` for data analysis code. +> so if you want to dig deeper into working with databases in Python, that is a good next +> package to learn about. We will work with `ibis` in this book, as it +> provides a more modern and friendlier syntax that is more like `pandas` for data analysis code. ```{code-cell} ipython3 import ibis @@ -621,7 +629,7 @@ conn = ibis.sqlite.connect("data/can_lang.db") ``` Often relational databases have many tables; thus, in order to retrieve -data from a database, you need to know the name of the table +data from a database, you need to know the name of the table in which the data is stored. You can get the names of all the tables in the database using the `list_tables` function: @@ -636,22 +644,22 @@ tables The `list_tables` function returned only one name---`"can_lang"`---which tells us that there is only one table in this database. To reference a table in the -database (so that we can perform operations like selecting columns and filtering rows), we +database (so that we can perform operations like selecting columns and filtering rows), we use the `table` function from the `conn` object. The object returned by the `table` function allows us to work with data stored in databases as if they were just regular `pandas` data frames; but secretly, behind -the scenes, `ibis` will turn your commands into SQL queries! +the scenes, `ibis` will turn your commands into SQL queries! ```{code-cell} ipython3 canlang_table = conn.table("can_lang") -canlang_table +canlang_table ``` ```{index} database; count, ibis; count ``` Although it looks like we might have obtained the whole data frame from the database, we didn't! -It's a *reference*; the data is still stored only in the SQLite database. The `canlang_table` object +It's a *reference*; the data is still stored only in the SQLite database. 
The `canlang_table` object is an `AlchemyTable` (`ibis` is using `sqlalchemy` under the hood!), which, when printed, tells you which columns are available in the table. But unlike a usual `pandas` data frame, we do not immediately know how many rows are in the table. In order to find out how many @@ -665,7 +673,7 @@ canlang_table.count() ```{index} execute, ibis; execute ``` -Wait a second...this isn't the number of rows in the database. In fact, we haven't actually sent our +Wait a second...this isn't the number of rows in the database. In fact, we haven't actually sent our SQL query to the database yet! We need to explicitly tell `ibis` when we want to send the query. The reason for this is that databases are often more efficient at working with (i.e., selecting, filtering, joining, etc.) large data sets than Python. And typically, the database will not even @@ -693,23 +701,24 @@ str(canlang_table.count().compile()) The output above shows the SQL code that is sent to the database. When we write `canlang_table.count().execute()` in Python, in the background, the `execute` function is translating the Python code into SQL, sending that SQL to the database, and then translating the -response for us. So `ibis` does all the hard work of translating from Python to SQL and back for us; -we can just stick with Python! +response for us. So `ibis` does all the hard work of translating from Python to SQL and back for us; +we can just stick with Python! The `ibis` package provides lots of `pandas`-like tools for working with database tables. -For example, we can look at the first few rows of the table by using the `head` function---and +For example, we can look at the first few rows of the table by using the `head` function---and we won't forget to `execute` to see the result! 
```{index} database; head, ibis; ``` ```{code-cell} ipython3 +:tags: ["output_scroll"] canlang_table.head(10).execute() ``` You can see that `ibis` actually returned a `pandas` data frame to us after we executed the query, which is very convenient for working with the data after getting it from the database. -So now that we have the `canlang_table` table reference for the 2016 Canadian Census data in hand, we +So now that we have the `canlang_table` table reference for the 2016 Canadian Census data in hand, we can mostly continue onward as if it were a regular data frame. For example, let's do the same exercise from Chapter 1: we will obtain only those rows corresponding to Aboriginal languages, and keep only the `language` and `mother_tongue` columns. @@ -723,7 +732,7 @@ to obtain only certain rows. Below we filter the data to include only Aboriginal canlang_table_filtered = canlang_table[canlang_table["category"] == "Aboriginal languages"] canlang_table_filtered ``` -Above you can see that we have not yet executed this command; `canlang_table_filtered` is just showing +Above you can see that we have not yet executed this command; `canlang_table_filtered` is just showing the first part of our query (the part that starts with `Selection[r0]` above). We didn't call `execute` because we are not ready to bring the data into Python yet. We can still use the database to do some work to obtain *only* the small amount of data we want to work with locally @@ -746,7 +755,7 @@ aboriginal_lang_data `ibis` provides many more functions (not just the `[]` operation) that you can use to manipulate the data within the database before calling -`execute` to obtain the data in Python. But `ibis` does not provide *every* function +`execute` to obtain the data in Python. But `ibis` does not provide *every* function that we need for analysis; we do eventually need to call `execute`. 
For example, `ibis` does not provide the `tail` function to look at the last rows in a database, even though `pandas` does. @@ -755,6 +764,7 @@ rows in a database, even though `pandas` does. ``` ```{code-cell} ipython3 +:tags: ["output_scroll"] canlang_table_selected.tail(6) ``` @@ -768,14 +778,14 @@ But be very careful using `execute`: databases are often *very* big, and reading an entire table into Python might take a long time to run or even possibly crash your machine. So make sure you select and filter the database table to reduce the data to a reasonable size before using `execute` to read it into Python! - -### Reading data from a PostgreSQL database + +### Reading data from a PostgreSQL database ```{index} database; PostgreSQL ``` PostgreSQL (also called Postgres) is a very popular -and open-source option for relational database software. +and open-source option for relational database software. Unlike SQLite, PostgreSQL uses a client–server database engine, as it was designed to be used and accessed on a network. This means that you have to provide more information @@ -790,13 +800,13 @@ need to include when you call the `connect` function is listed below: Below we demonstrate how to connect to a version of the `can_mov_db` database, which contains information about Canadian movies. -Note that the `host` (`fakeserver.stat.ubc.ca`), `user` (`user0001`), and -`password` (`abc123`) below are *not real*; you will not actually +Note that the `host` (`fakeserver.stat.ubc.ca`), `user` (`user0001`), and +`password` (`abc123`) below are *not real*; you will not actually be able to connect to a database using this information. ```python conn = ibis.postgres.connect( - database = "can_mov_db", + database = "can_mov_db", host = "fakeserver.stat.ubc.ca", port = 5432, user = "user0001", @@ -819,7 +829,7 @@ conn.list_tables() We see that there are 10 tables in this database. 
Let's first look at the `"ratings"` table to find the lowest rating that exists in the `can_mov_db` -database. +database. ```python ratings_table = conn.table("ratings") @@ -887,18 +897,18 @@ then use `ibis` to translate `pandas`-like commands (the `[]` operation, `head`, etc.) into SQL queries that the database understands, and then finally `execute` them. And not all `pandas` commands can currently be translated via `ibis` into database queries. So you might be wondering: why should we use -databases at all? +databases at all? Databases are beneficial in a large-scale setting: - They enable storing large data sets across multiple computers with backups. - They provide mechanisms for ensuring data integrity and validating input. - They provide security and data access control. -- They allow multiple users to access data simultaneously +- They allow multiple users to access data simultaneously and remotely without conflicts and errors. - For example, there are billions of Google searches conducted daily in 2021 {cite:p}`googlesearches`. - Can you imagine if Google stored all of the data - from those searches in a single `.csv` file!? Chaos would ensue! + For example, there are billions of Google searches conducted daily in 2021 {cite:p}`googlesearches`. + Can you imagine if Google stored all of the data + from those searches in a single `.csv` file!? Chaos would ensue! ## Writing data from Python to a `.csv` file @@ -910,7 +920,7 @@ that has changed (through selecting columns, filtering rows, etc.) to a file to share it with others or use it for another step in the analysis. The most straightforward way to do this is to use the `to_csv` function from the `pandas` package. The default -arguments are to use a comma (`,`) as the separator, and to include column names +arguments are to use a comma (`,`) as the separator, and to include column names in the first row. We also specify `index = False` to tell `pandas` not to print row numbers in the `.csv` file. 
Below we demonstrate creating a new version of the Canadian languages data set without the "Official languages" category according to the @@ -921,18 +931,18 @@ no_official_lang_data = canlang_data[canlang_data["category"] != "Official langu no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) ``` -% ## Obtaining data from the web -% +% ## Obtaining data from the web +% % > **Note:** This section is not required reading for the remainder of the textbook. It % > is included for those readers interested in learning a little bit more about % > how to obtain different types of data from the web. -% +% % ```{index} see: application programming interface; API % ``` -% +% % ```{index} API % ``` -% +% % Data doesn't just magically appear on your computer; you need to get it from % somewhere. Earlier in the chapter we showed you how to access data stored in a % plain text, spreadsheet-like format (e.g., comma- or tab-separated) from a web @@ -946,16 +956,16 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % data they have access to, and *how much* data they can access. Typically, the % website owner will give you a *token* (a secret string of characters somewhat % like a password) that you have to provide when accessing the API. -% +% % ```{index} web scraping, CSS, HTML % ``` -% +% % ```{index} see: hypertext markup language; HTML % ``` -% +% % ```{index} see: cascading style sheet; CSS % ``` -% +% % Another interesting thought: websites themselves *are* data! When you type a % URL into your browser window, your browser asks the *web server* (another % computer on the internet whose job it is to respond to requests for the @@ -963,117 +973,117 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % data into something you can see. If the website shows you some information that % you're interested in, you could *create* a data set for yourself by copying and % pasting that information into a file. 
This process of taking information -% directly from what a website displays is called +% directly from what a website displays is called % *web scraping* (or sometimes *screen scraping*). Now, of course, copying and pasting % information manually is a painstaking and error-prone process, especially when % there is a lot of information to gather. So instead of asking your browser to % translate the information that the web server provides into something you can % see, you can collect that data programmatically—in the form of -% **h**yper**t**ext **m**arkup **l**anguage -% (HTML) -% and **c**ascading **s**tyle **s**heet (CSS) code—and process it +% **h**yper**t**ext **m**arkup **l**anguage +% (HTML) +% and **c**ascading **s**tyle **s**heet (CSS) code—and process it % to extract useful information. HTML provides the % basic structure of a site and tells the webpage how to display the content % (e.g., titles, paragraphs, bullet lists etc.), whereas CSS helps style the -% content and tells the webpage how the HTML elements should -% be presented (e.g., colors, layouts, fonts etc.). -% +% content and tells the webpage how the HTML elements should +% be presented (e.g., colors, layouts, fonts etc.). +% % This subsection will show you the basics of both web scraping % with the [`BeautifulSoup` Python package](https://beautiful-soup-4.readthedocs.io/en/latest/) {cite:p}`beautifulsoup` % and accessing the Twitter API % using the [`tweepy` Python package](https://github.com/tweepy/tweepy) {cite:p}`tweepy`. -% +% % +++ -% +% % ### Web scraping -% +% % #### HTML and CSS selectors -% +% % ```{index} web scraping, HTML; selector, CSS; selector, Craiglist % ``` -% +% % When you enter a URL into your browser, your browser connects to the % web server at that URL and asks for the *source code* for the website. 
-% This is the data that the browser translates +% This is the data that the browser translates % into something you can see; so if we % are going to create our own data by scraping a website, we have to first understand % what that data looks like! For example, let's say we are interested % in knowing the average rental price (per square foot) of the most recently -% available one-bedroom apartments in Vancouver +% available one-bedroom apartments in Vancouver % on [Craiglist](https://vancouver.craigslist.org). When we visit the Vancouver Craigslist -% website and search for one-bedroom apartments, +% website and search for one-bedroom apartments, % we should see something similar to {numref}`fig:craigslist-human`. -% +% % +++ -% +% % ```{figure} img/craigslist_human.png % :name: fig:craigslist-human -% +% % Craigslist webpage of advertisements for one-bedroom apartments. % ``` -% +% % +++ -% +% % Based on what our browser shows us, it's pretty easy to find the size and price % for each apartment listed. But we would like to be able to obtain that information % using Python, without any manual human effort or copying and pasting. We do this by % examining the *source code* that the web server actually sent our browser to -% display for us. We show a snippet of it below; the -% entire source +% display for us. We show a snippet of it below; the +% entire source % is [included with the code for this book](https://github.com/UBC-DSCI/introduction-to-datascience-python/blob/main/source/img/website_source.txt): -% +% % ```html % % $800 -% +% % % 1br - % -% +% % (13768 108th Avenue) -% +% % % map % -% +% % % hide this posting % -% +% % % restore % restore this posting % -% +% % %

% %
  • -% +% % $2285 % % ``` -% +% % Oof...you can tell that the source code for a web page is not really designed % for humans to understand easily. However, if you look through it closely, you % will find that the information we're interested in is hidden among the muck. % For example, near the top of the snippet % above you can see a line that looks like -% +% % ```html % $800 % ``` -% +% % That is definitely storing the price of a particular apartment. With some more % investigation, you should be able to find things like the date and time of the % listing, the address of the listing, and more. So this source code most likely % contains all the information we are interested in! -% +% % ```{index} HTML; tag % ``` -% +% % Let's dig into that line above a bit more. You can see that % that bit of code has an *opening tag* (words between `<` and `>`, like % ``) and a *closing tag* (the same with a slash, like ``). HTML @@ -1087,86 +1097,86 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % apartment prices, maybe we can look for all the tags with the `"result-price"` % class, and grab the information between the opening and closing tag. Indeed, % take a look at another line of the source snippet above: -% +% % ```html % $2285 % ``` -% +% % It's yet another price for an apartment listing, and the tags surrounding it % have the `"result-price"` class. Wonderful! Now that we know what pattern we % are looking for—a dollar amount between opening and closing tags that have the -% `"result-price"` class—we should be able to use code to pull out all of the +% `"result-price"` class—we should be able to use code to pull out all of the % matching patterns from the source code to obtain our data. This sort of "pattern" % is known as a *CSS selector* (where CSS stands for **c**ascading **s**tyle **s**heet). 
-% -% The above was a simple example of "finding the pattern to look for"; many +% +% The above was a simple example of "finding the pattern to look for"; many % websites are quite a bit larger and more complex, and so is their website % source code. Fortunately, there are tools available to make this process -% easier. For example, -% [SelectorGadget](https://selectorgadget.com/) is -% an open-source tool that simplifies identifying the generating -% and finding of CSS selectors. +% easier. For example, +% [SelectorGadget](https://selectorgadget.com/) is +% an open-source tool that simplifies generating +% and finding CSS selectors. % At the end of the chapter in the additional resources section, we include a link to -% a short video on how to install and use the SelectorGadget tool to -% obtain CSS selectors for use in web scraping. -% After installing and enabling the tool, you can click the -% website element for which you want an appropriate selector. For +% a short video on how to install and use the SelectorGadget tool to +% obtain CSS selectors for use in web scraping. +% After installing and enabling the tool, you can click the +% website element for which you want an appropriate selector. For % example, if we click the price of an apartment listing, we % find that SelectorGadget shows us the selector `.result-price` % in its toolbar, and highlights all the other apartment % prices that would be obtained using that selector ({numref}`fig:sg1`). -% +% % ```{figure} img/sg1.png % :name: fig:sg1 -% +% % Using the SelectorGadget on a Craigslist webpage to obtain the CCS selector useful for obtaining apartment prices. % ``` -% +% % If we then click the size of an apartment listing, SelectorGadget shows us % the `span` selector, and highlights many of the lines on the page; this indicates that the -% `span` selector is not specific enough to capture only apartment sizes ({numref}`fig:sg3`).
-% +% `span` selector is not specific enough to capture only apartment sizes ({numref}`fig:sg3`). +% % ```{figure} img/sg3.png % :name: fig:sg3 -% +% % Using the SelectorGadget on a Craigslist webpage to obtain a CCS selector useful for obtaining apartment sizes. % ``` -% +% % To narrow the selector, we can click one of the highlighted elements that -% we *do not* want. For example, we can deselect the "pic/map" links, +% we *do not* want. For example, we can deselect the "pic/map" links, % resulting in only the data we want highlighted using the `.housing` selector ({numref}`fig:sg2`). -% +% % ```{figure} img/sg2.png % :name: fig:sg2 -% +% % Using the SelectorGadget on a Craigslist webpage to refine the CCS selector to one that is most useful for obtaining apartment sizes. % ``` -% +% % So to scrape information about the square footage and rental price % of apartment listings, we need to use % the two CSS selectors `.housing` and `.result-price`, respectively. % The selector gadget returns them to us as a comma-separated list (here % `.housing , .result-price`), which is exactly the format we need to provide to % Python if we are using more than one CSS selector. -% +% % **Stop! Are you allowed to scrape that website?** -% +% % ```{index} web scraping; permission % ``` -% +% % +++ -% +% % *Before* scraping data from the web, you should always check whether or not % you are *allowed* to scrape it! There are two documents that are important % for this: the `robots.txt` file and the Terms of Service % document. If we take a look at [Craigslist's Terms of Service document](https://www.craigslist.org/about/terms.of.use), -% we find the following text: *"You agree not to copy/collect CL content +% we find the following text: *"You agree not to copy/collect CL content % via robots, spiders, scripts, scrapers, crawlers, or any automated or manual equivalent (e.g., by hand)."* % So unfortunately, without explicit permission, we are not allowed to scrape the website. 
-% +% % ```{index} Wikipedia % ``` -% +% % What to do now? Well, we *could* ask the owner of Craigslist for permission to scrape. % However, we are not likely to get a response, and even if we did they would not likely give us permission. % The more realistic answer is that we simply cannot scrape Craigslist. If we still want @@ -1174,122 +1184,122 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % To continue learning how to scrape data from the web, let's instead % scrape data on the population of Canadian cities from Wikipedia. % We have checked the [Terms of Service document](https://foundation.wikimedia.org/wiki/Terms_of_Use/en), -% and it does not mention that web scraping is disallowed. +% and it does not mention that web scraping is disallowed. % We will use the SelectorGadget tool to pick elements that we are interested in -% (city names and population counts) and deselect others to indicate that we are not +% (city names and population counts) and deselect others to indicate that we are not % interested in them (province names), as shown in {numref}`fig:sg4`. -% +% % ```{figure} img/selectorgadget-wiki-updated.png % :name: fig:sg4 -% +% % Using the SelectorGadget on a Wikipedia webpage. % ``` -% +% % We include a link to a short video tutorial on this process at the end of the chapter % in the additional resources section. SelectorGadget provides in its toolbar % the following list of CSS selectors to use: -% +% % +++ -% +% % ```code -% td:nth-child(8) , -% td:nth-child(6) , -% td:nth-child(4) , +% td:nth-child(8) , +% td:nth-child(6) , +% td:nth-child(4) , % .mw-parser-output div tr+ tr td:nth-child(2) % ``` -% +% % +++ -% +% % Now that we have the CSS selectors that describe the properties of the elements % that we want to target (e.g., has a tag name `price`), we can use them to find % certain elements in web pages and extract data. 
-% +% % +++ -% +% % **Using `pandas.read_html`** -% +% % +++ -% +% % The easiest way to read a table from HTML is to use [`pandas.read_html`](https://pandas.pydata.org/docs/reference/api/pandas.read_html.html). We can see that the Wikipedia page of "Canada" has 18 tables. -% +% % ```{code-cell} ipython3 % :tags: [remove-output] -% +% % canada_wiki = pd.read_html("https://en.wikipedia.org/wiki/Canada") % len(canada_wiki) % ``` -% +% % ``` % 18 % ``` -% +% % +++ -% +% % With some inspection, we find that the table that shows the population of the most populated provinces is of index 1. -% +% % ```{code-cell} ipython3 % :tags: [remove-output] -% +% % df = canada_wiki[1] % df.columns = df.columns.droplevel() % df % ``` -% +% % ```{code-cell} ipython3 % :tags: [remove-input] -% +% % df = pd.read_csv("data/canada-wiki-read_html.csv", index_col=0) % df % ``` -% +% % **Using `BeautifulSoup`** -% +% % ```{index} BeautifulSoup, requests % ``` -% +% % Now that we have our CSS selectors we can use the `requests` and `BeautifulSoup` Python packages to scrape our desired data from the website. We start by loading the packages: -% +% % ```{code-cell} ipython3 % import requests % from bs4 import BeautifulSoup % ``` -% +% % Next, we tell Python what page we want to scrape by providing the webpage's URL in quotations to the function `requests.get` and pass it into the `BeautifulSoup` function for parsing: -% +% % ```{code-cell} ipython3 % wiki = requests.get("https://en.wikipedia.org/wiki/Canada") % page = BeautifulSoup(wiki.content, "html.parser") % ``` -% +% % The `requests.get` function sends a `GET` request to the specified URL and returns the server's response to the HTTP request (*i.e.* a `requests.Response` object). The `BeautifulSoup` function takes the content of the response and returns the HTML source code itself, which we have % stored in the `page` variable. 
Next, we use the `select` method of the page object along with the CSS selectors we obtained from the SelectorGadget tool. Make sure to surround the selectors with quotation marks; `select` expects that -% argument is a string. It selects *nodes* from the HTML document that +% argument is a string. It selects *nodes* from the HTML document that % match the CSS selectors you specified. A *node* is an HTML tag pair (e.g., % `` and `` which defines the cell of a table) combined with the content % stored between the tags. For our CSS selector `td:nth-child(6)`, an example % node that would be selected would be: -% +% % +++ -% +% % ``` % % London % % ``` -% +% % +++ -% +% % We store the result of the `select` function in the `population_nodes` variable. Note that it returns a list, and we slice the list to only print the first 5 elements. -% +% % ```{code-cell} ipython3 % :tags: [remove-output] -% +% % population_nodes = page.select( % "td:nth-child(8) , td:nth-child(6) , td:nth-child(4) , .mw-parser-output div td:nth-child(2)" % ) % population_nodes[:5] % ``` -% +% % ``` % [Toronto, % 6,202,225, @@ -1298,27 +1308,27 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % , % Montreal] % ``` -% +% % +++ -% -% Next we extract the meaningful data—in other words, we get rid of the HTML code syntax and tags—from +% +% Next we extract the meaningful data—in other words, we get rid of the HTML code syntax and tags—from % the nodes using the `get_text` % function. In the case of the example % node above, `get_text` function returns `"London"`. -% +% % ```{code-cell} ipython3 % :tags: [remove-output] -% +% % [row.get_text() for row in population_nodes][:5] % ``` -% +% % ``` % ['Toronto', '6,202,225', 'London', '543,551\n', 'Montreal'] % ``` -% +% % +++ -% -% Fantastic! We seem to have extracted the data of interest from the +% +% Fantastic! We seem to have extracted the data of interest from the % raw HTML source code. 
But we are not quite done; the data % is not yet in an optimal format for data analysis. Both the city names and % population are encoded as characters in a single vector, instead of being in a @@ -1328,14 +1338,14 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % dealing with numbers), and some even contain a line break character at the end % (`\n`). In Chapter {ref}`wrangling`, we will learn more about how to *wrangle* data % such as this into a more useful format for data analysis using Python. -% +% % +++ -% +% % ### Using an API -% +% % ```{index} API % ``` -% +% % Rather than posting a data file at a URL for you to download, many websites these days % provide an API that must be accessed through a programming language like Python. The benefit of this % is that data owners have much more control over the data they provide to users. However, unlike @@ -1343,87 +1353,87 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % has its own API designed especially for its own use case. Therefore we will just provide one example % of accessing data through an API in this book, with the hope that it gives you enough of a basic % idea that you can learn how to use another API if needed. -% +% % ```{index} API; tweepy, tweepy, Twitter, API; token % ``` -% +% % +++ -% +% % In particular, in this book we will show you the basics of how to use % the `tweepy` package in Python to access % data from the Twitter API. `tweepy` requires the [Twitter Developer Portal](https://developer.twitter.com/en/portal/dashboard) and you will need to get tokens and secrets from that, through which your access to the data will then be authenticated and controlled. -% +% % +++ -% +% % First, we go to the [Twitter Developer Portal](https://developer.twitter.com/en/portal/dashboard) and sign up an account if you do not have one yet. Note that you will need a valid phone number to associate with your developer account. 
After filling out the basic information, we will get the *essential access* to the Twitter API. Then we can create an app and hit the "get key" button, and we will get the API key and API key secret of the app (along with the bearer token which will not be used in this demonstration). **We need to store the key and secret at a safe place, and make sure do not show them to anyone else (also do not accidentally push it to the GitHub repository).** If you lose the key, you can always regenerate it. Next, we go to the "Keys and tokens" tab of the app, and generate an access token and an access token secret. **Save the access token and the access token secret at a safe place as well.** Your app will look something like {numref}`fig:twitter-API-keys-tokens`. -% +% % +++ -% +% % ```{figure} img/twitter-API-keys-tokens.png % :name: fig:twitter-API-keys-tokens -% -% Generating the API key-secret pair and the access token-secret pair in Twitter API. +% +% Generating the API key-secret pair and the access token-secret pair in Twitter API. % ``` -% +% % +++ -% +% % Once you get the access keys and secrets, you can follow along with the examples that we show here. % To get started, load the `tweepy` package and authenticate our access to the Twitter developer portal account. 
-% +% % ```{code-cell} ipython3 % :tags: [remove-output] -% +% % import tweepy -% +% % # replace these with the api key, api key secret, access token and access token secret % # generated on your own -% api_key = "8OxHWiIWjy8M39LvnC8OfSXrj" +% api_key = "8OxHWiIWjy8M39LvnC8OfSXrj" % api_key_secret = "scqjRqX5stoy4pYB5Zu52tCBKzhGLDh5nRqTEM6CMoLRkRLR8F" -% +% % access_token = "1556029189484007425-mYwaDCI1WnCxjuMt0jb2UYD2ns8BYB" % access_token_secret = "pDG4Ta7giYLY3mablPhd6y9bB5y2Aer1Cn18rihIJFBB7" -% +% % # Authenticate to Twitter % auth = tweepy.OAuthHandler(api_key, api_key_secret) % auth.set_access_token(access_token, access_token_secret) -% +% % api = tweepy.API(auth) -% +% % try: % api.verify_credentials() % print("Successful Authentication") % except: % print("Failed authentication") % ``` -% +% % ``` % Successful Authentication % ``` -% +% % +++ -% -% `tweepy` provides an extensive set of functions to search -% Twitter for tweets, users, their followers, and more. -% Let's construct a small data set of the last 200 tweets and +% +% `tweepy` provides an extensive set of functions to search +% Twitter for tweets, users, their followers, and more. +% Let's construct a small data set of the last 200 tweets and % retweets from the [@scikit_learn](https://twitter.com/scikit_learn) account. A few of the most recent tweets % are shown in {numref}`fig:01-scikit-learn-twitter`. -% +% % +++ -% +% % ```{figure} img/scikit-learn-twitter.png % :name: fig:01-scikit-learn-twitter -% +% % The `scikit-learn` account Twitter feed. % ``` -% +% % +++ -% +% % **Stop! Think about your API usage carefully!** -% +% % When you access an API, you are initiating a transfer of data from a web server % to your computer. Web servers are expensive to run and do not have infinite resources. -% If you try to ask for *too much data* at once, you can use up a huge amount of the server's bandwidth. 
-% If you try to ask for data *too frequently*—e.g., if you +% If you try to ask for *too much data* at once, you can use up a huge amount of the server's bandwidth. +% If you try to ask for data *too frequently*—e.g., if you % make many requests to the server in quick succession—you can also bog the server down and make % it unable to talk to anyone else. Most servers have mechanisms to revoke your access if you are not % careful, but you should try to prevent issues from happening in the first place by being extra careful @@ -1432,19 +1442,19 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % Be careful not to overrun your quota! In this example, we should take a look at % [the Twitter website](https://developer.twitter.com/en/docs/twitter-api/rate-limits) to see what limits % we should abide by when using the API. -% +% % +++ -% +% % **Using `tweepy`** -% +% % After checking the Twitter website, it seems like asking for 200 tweets one time is acceptable. % So we can use the `user_timeline` function to ask for the last 200 tweets from the [@scikit_learn](https://twitter.com/scikit_learn) account. -% +% % ```{code-cell} ipython3 % :tags: [remove-output] -% +% % userID = "scikit_learn" -% +% % scikit_learn_tweets = api.user_timeline( % screen_name=userID, % count=200, @@ -1452,69 +1462,69 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % tweet_mode="extended", % ) % ``` -% +% % Let's take a look at the first 3 most recent tweets of [@scikit_learn](https://twitter.com/scikit_learn) through accessing the attributes of tweet data dictionary: -% +% % ```{code-cell} ipython3 % :tags: [remove-output] -% +% % for info in scikit_learn_tweets[:3]: % print("ID: {}".format(info.id)) % print(info.created_at) % print(info.full_text) % print("\n") % ``` -% +% % ``` % ID: 1555686128971403265 % 2022-08-05 22:44:11+00:00 % scikit-learn 1.1.2 is out on https://t.co/lSpi4eDc2t and conda-forge! 
-% +% % This is a small maintenance release that fixes a couple of regressions: % https://t.co/Oa84ES0qpG -% -% +% +% % ID: 1549321048943988737 % 2022-07-19 09:11:37+00:00 % RT @MarenWestermann: @scikit_learn It is worth highlighting that this scikit-learn sprint is seeing the highest participation of women out… -% -% +% +% % ID: 1548339716465930244 % 2022-07-16 16:12:09+00:00 % @StefanieMolin @theBodlina @RichardKlima We continue pulling requests here in Dublin. Putting some Made in Ireland code in the scikit-learn codebase šŸ‡®šŸ‡Ŗ . Current stats: 18 PRs opened, 12 merged šŸš€ https://t.co/ccWy8vh8YI % ``` -% +% % +++ -% +% % A full list of available attributes provided by Twitter API can be found [here](https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet). -% +% % +++ -% +% % For the demonstration purpose, let's only use a % few variables of interest: `created_at`, `user.screen_name`, `retweeted`, % and `full_text`, and construct a `pandas` DataFrame using the extracted information. -% +% % ```{code-cell} ipython3 % :tags: [remove-output] -% +% % columns = ["time", "user", "is_retweet", "text"] % data = [] % for tweet in scikit_learn_tweets: % data.append( % [tweet.created_at, tweet.user.screen_name, tweet.retweeted, tweet.full_text] % ) -% +% % scikit_learn_tweets_df = pd.DataFrame(data, columns=columns) % scikit_learn_tweets_df % ``` -% +% % ```{code-cell} ipython3 % :tags: [remove-input] -% +% % scikit_learn_tweets_df = pd.read_csv("data/reading_api_df.csv", index_col=0) % scikit_learn_tweets_df % ``` -% +% % If you look back up at the image of the [@scikit_learn](https://twitter.com/scikit_learn) Twitter page, you will % recognize the text of the most recent few tweets in the above data frame. 
In % other words, we have successfully created a small data set using the Twitter @@ -1522,21 +1532,21 @@ no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) % the extracted information can be easily converted into a `pandas` data frame (although not *every* API will provide data in such a nice format). % From this point onward, the `scikit_learn_tweets_df` data frame is stored on your % machine, and you can play with it to your heart's content. For example, you can use -% `pandas.to_csv` to save it to a file and `pandas.read_csv` to read it into Python again later; +% `pandas.to_csv` to save it to a file and `pandas.read_csv` to read it into Python again later; % and after reading the next few chapters you will have the skills to % compute the percentage of retweets versus tweets, find the most oft-retweeted -% account, make visualizations of the data, and much more! If you decide that you want -% to ask the Twitter API for more data +% account, make visualizations of the data, and much more! If you decide that you want +% to ask the Twitter API for more data % (see [the `tweepy` page](https://github.com/tweepy/tweepy) % for more examples of what is possible), just be mindful as usual about how much % data you are requesting and how frequently you are making requests. -% +% % +++ ## Exercises -Practice exercises for the material covered in this chapter -can be found in the accompanying +Practice exercises for the material covered in this chapter +can be found in the accompanying [worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme) in the "Reading in data locally and from the web" row. You can launch an interactive version of the worksheet in your browser by clicking the "launch binder" button. @@ -1548,7 +1558,7 @@ and guidance that the worksheets provide will function as intended. 
## Additional resources -- The [`pandas` documentation](https://pandas.pydata.org/docs/getting_started/index.html) +- The [`pandas` documentation](https://pandas.pydata.org/docs/getting_started/index.html) provides the documentation for many of the reading functions we cover in this chapter. It is where you should look if you want to learn more about the functions in this chapter, the full set of arguments you can use, and other related functions. diff --git a/source/references.md b/source/references.md index c25e6545..942a1a56 100644 --- a/source/references.md +++ b/source/references.md @@ -13,6 +13,6 @@ kernelspec: name: python3 --- -`r if (knitr:::is_html_output()) ' -# References -- TBD -'` +# References + + diff --git a/source/viz.md b/source/viz.md index b9c6c0bc..5522124e 100644 --- a/source/viz.md +++ b/source/viz.md @@ -12,38 +12,55 @@ kernelspec: name: python3 --- +```{code-cell} ipython3 +:tags: [remove-cell] + +# ignore warnings from altair + +import warnings +def warn(*args, **kwargs): + pass +warnings.warn = warn +``` + + + (viz)= # Effective data visualization -## Overview +## Overview This chapter will introduce concepts and tools relating to data visualization beyond what we have seen and practiced so far. We will focus on guiding principles for effective data visualization and explaining visualizations independent of any particular tool or programming language. In the process, we will cover some specifics of creating visualizations (scatter plots, bar -plots, line plots, and histograms) for data using Python. +plots, line plots, and histograms) for data using Python. 
## Chapter learning objectives By the end of the chapter, readers will be able to do the following: - +- Describe when to use the following kinds of visualizations to answer specific questions using a data set: + - scatter plots + - line plots + - bar plots + - histogram plots - Given a data set and a question, select from the above plot types and use Python to create a visualization that best answers the question. - Given a visualization and a question, evaluate the effectiveness of the visualization and suggest improvements to better answer the question. - Referring to the visualization, communicate the conclusions in non-technical terms. -- Identify rules of thumb for creating effective visualizations. +- Identify rules of thumb for creating effective visualizations. - Define the two key aspects of altair objects: - mark objects - encodings - Use the altair library in Python to create and refine the above visualizations using: - - mark objects: mark_point, mark_line, mark_bar - - encodings : x, y, fill, color, shape - - subplots: facet + - mark objects: `mark_point`, `mark_line`, `mark_bar` + - encodings : `x`, `y`, `fill`, `color`, `shape` + - subplots: `facet` - Describe the difference in raster and vector output formats. - Use `chart.save()` to save visualizations in `.png` and `.svg` format. ## Choosing the visualization -#### *Ask a question, and answer it* {-} +#### *Ask a question, and answer it* ```{index} question; visualization ``` @@ -58,22 +75,22 @@ Imagine your visualization as part of a poster presentation for a project; even if you aren't standing at the poster explaining things, an effective visualization will convey your message to the audience. -Recall the different data analysis questions -from Chapter \@ref(intro). -With the visualizations we will cover in this chapter, -we will be able to answer *only descriptive and exploratory* questions. 
-Be careful to not answer any *predictive, inferential, causal* -*or mechanistic* questions with the visualizations presented here, -as we have not learned the tools necessary to do that properly just yet. +Recall the different data analysis questions +from the {ref}`intro` chapter. +With the visualizations we will cover in this chapter, +we will be able to answer *only descriptive and exploratory* questions. +Be careful to not answer any *predictive, inferential, causal* +*or mechanistic* questions with the visualizations presented here, +as we have not learned the tools necessary to do that properly just yet. As with most coding tasks, it is totally fine (and quite common) to make mistakes and iterate a few times before you find the right visualization for your data and question. There are many different kinds of plotting -graphics available to use (see Chapter 5 of *Fundamentals of Data Visualization* {cite:p}`wilkeviz` for a directory). -The types of plot that we introduce in this book are shown in {numref}`plot_sketches` -which one you should select depends on your data -and the question you want to answer. -In general, the guiding principles of when to use each type of plot +graphics available to use (see Chapter 5 of *Fundamentals of Data Visualization* {cite:p}`wilkeviz` for a directory). +The types of plot that we introduce in this book are shown in {numref}`plot_sketches`; +which one you should select depends on your data +and the question you want to answer. +In general, the guiding principles of when to use each type of plot are as follows: ```{index} visualization; line, visualization; histogram, visualization; scatter, visualization; bar, distribution @@ -106,13 +123,13 @@ alternative. 
+++ ## Refining the visualization -#### *Convey the message, minimize noise* {-} +#### *Convey the message, minimize noise* Just being able to make a visualization in Python with `altair` (or any other tool for that matter) doesn't mean that it effectively communicates your message to others. Once you have selected a broad type of visualization to use, you will have to refine it to suit your particular need. Some rules of thumb for doing -this are listed below. They generally fall into two classes: you want to +this are listed below. They generally fall into two classes: you want to *make your visualization convey your message*, and you want to *reduce visual noise* as much as possible. Humans have limited cognitive ability to process information; both of these types of refinement aim to reduce the mental load on @@ -126,9 +143,9 @@ understand and remember your message quickly. - Ensure the text, symbols, lines, etc., on your visualization are big enough to be easily read. - Ensure the data are clearly visible; don't hide the shape/distribution of the data behind other objects (e.g., a bar). - Make sure to use color schemes that are understandable by those with - colorblindness (a surprisingly large fraction of the overall + colorblindness (a surprisingly large fraction of the overall population—from about 1% to 10%, depending on sex and ancestry {cite:p}`deebblind`). - For example, [Color Schemes](https://vega.github.io/vega/docs/schemes/) + For example, [Color Schemes](https://altair-viz.github.io/user_guide/customization.html#customizing-colors) provides the ability to pick such color schemes, and you can check your visualizations after you have created them by uploading to online tools such as a [color blindness simulator](https://www.color-blindness.com/coblis-color-blindness-simulator/). @@ -136,7 +153,7 @@ understand and remember your message quickly. **Minimize noise** -- Use colors sparingly. 
Too many different colors can be distracting, create false patterns, and detract from the message. +- Use colors sparingly. Too many different colors can be distracting, create false patterns, and detract from the message. - Be wary of overplotting. Overplotting is when marks that represent the data overlap, and is problematic as it prevents you from seeing how many data points are represented in areas of the visualization where this occurs. If your @@ -147,14 +164,14 @@ understand and remember your message quickly. +++ -## Creating visualizations with `altair` +## Creating visualizations with `altair` #### *Build the visualization iteratively* ```{index} altair ``` -This section will cover examples of how to choose and refine a visualization given a data set and a question that you want to answer, -and then how to create the visualization in Python using `altair`. To use the `altair` package, we need to import the `altair` package. We will also import `pandas` in order to support reading and other data related operations. +This section will cover examples of how to choose and refine a visualization given a data set and a question that you want to answer, +and then how to create the visualization in Python using `altair`. To use the `altair` package, we need to import the `altair` package. We will also import `pandas` to use for reading in the data. ```{code-cell} ipython3 import pandas as pd @@ -172,12 +189,12 @@ from myst_nb import glue ```{index} Mauna Loa ``` -The [Mauna Loa CO$_{\text{2}}$ data set](https://www.esrl.noaa.gov/gmd/ccgg/trends/data.html), -curated by Dr. Pieter Tans, NOAA/GML +The [Mauna Loa CO$_{\text{2}}$ data set](https://www.esrl.noaa.gov/gmd/ccgg/trends/data.html), +curated by Dr. Pieter Tans, NOAA/GML and Dr. 
Ralph Keeling, Scripps Institution of Oceanography, -records the atmospheric concentration of carbon dioxide -(CO$_{\text{2}}$, in parts per million) -at the Mauna Loa research station in Hawaii +records the atmospheric concentration of carbon dioxide +(CO$_{\text{2}}$, in parts per million) +at the Mauna Loa research station in Hawaii from 1959 onward {cite:p}`maunadata`. For this book, we are going to focus on the last 40 years of the data set, 1980-2020. @@ -185,7 +202,7 @@ For this book, we are going to focus on the last 40 years of the data set, ```{index} question; visualization ``` -**Question:** Does the concentration of atmospheric CO$_{\text{2}}$ change over time, +**Question:** Does the concentration of atmospheric CO$_{\text{2}}$ change over time, and are there any interesting patterns to note? ```{code-cell} ipython3 @@ -197,63 +214,75 @@ mauna_loa = mauna_loa[['date_measured', 'ppm']].query('ppm>0 and date_measured>" mauna_loa.to_csv("data/mauna_loa_data.csv", index=False) ``` - - To get started, we will read and inspect the data: ```{code-cell} ipython3 # mauna loa carbon dioxide data -co2_df = pd.read_csv("data/mauna_loa_data.csv", parse_dates=['date_measured']) +co2_df = pd.read_csv( + "data/mauna_loa_data.csv", parse_dates=['date_measured'] +) co2_df ``` ```{code-cell} ipython3 -co2_df.dtypes +co2_df.info() ``` -We see that there are two columns in the `co2_df` data frame; `date_measured` and `ppm`. -The `date_measured` column holds the date the measurement was taken, +We see that there are two columns in the `co2_df` data frame; `date_measured` and `ppm`. +The `date_measured` column holds the date the measurement was taken, and is of type `datetime64`. -The `ppm` column holds the value of CO$_{\text{2}}$ in parts per million -that was measured on each date, and is type `float64`. 
+The `ppm` column holds the value of CO$_{\text{2}}$ in parts per million +that was measured on each date, and is type `float64`; this is the usual +type for decimal numbers. > **Note:** `read_csv` was able to parse the `date_measured` column into the -> `datetime` vector type because it was entered -> in the international standard date format, -> called ISO 8601, which lists dates as `year-month-day` and we used `parse_dates=True`. -> `datetime` vectors are `double` vectors with special properties that allow +> `datetime` vector type because it was entered +> in the international standard date format, +> called ISO 8601, which lists dates as `year-month-day`, and because we passed `parse_dates=['date_measured']`. +> `datetime` vectors are `double` vectors with special properties that allow > them to handle dates correctly. -> For example, `datetime` type vectors allow functions like `altair` -> to treat them as numeric dates and not as character vectors, -> even though they contain non-numeric characters +> For example, `datetime` type vectors allow functions like `altair` +> to treat them as numeric dates and not as character vectors, +> even though they contain non-numeric characters > (e.g., in the `date_measured` column in the `co2_df` data frame). -> This means Python will not accidentally plot the dates in the wrong order -> (i.e., not alphanumerically as would happen if it was a character vector). -> More about dates and times can be viewed [here](https://wesmckinney.com/book/time-series.html) - -Since we are investigating a relationship between two variables -(CO$_{\text{2}}$ concentration and date), -a scatter plot is a good place to start. -Scatter plots show the data as individual points with `x` (horizontal axis) +> This means Python will not accidentally plot the dates in the wrong order +> (i.e., not alphanumerically as would happen if it was a character vector). +> More about dates and times can be viewed [here](https://wesmckinney.com/book/time-series.html).
+ +Since we are investigating a relationship between two variables +(CO$_{\text{2}}$ concentration and date), +a scatter plot is a good place to start. +Scatter plots show the data as individual points with `x` (horizontal axis) and `y` (vertical axis) coordinates. -Here, we will use the measurement date as the `x` coordinate -and the CO$_{\text{2}}$ concentration as the `y` coordinate. -while using the `altair` package, We create a plot object with the `alt.Chart()` function. +Here, we will use the measurement date as the `x` coordinate +and the CO$_{\text{2}}$ concentration as the `y` coordinate. +We create a plot object with the `alt.Chart()` function. There are a few basic aspects of a plot that we need to specify: ```{index} altair; geometric object, altair; geometric encoding, geometric object, geometric encoding ``` - The name of the **data frame** object to visualize. - - Here, we specify the `co2_df` data frame as an argument to the `alt.Chart()` function + - Here, we specify the `co2_df` data frame as an argument to `alt.Chart` - The **geometric object**, which specifies how the mapped data should be displayed. - - To create a geometric object, we use `Chart.mark_*` methods (see the [altair reference](https://altair-viz.github.io/user_guide/marks.html) for a list of geometric objects). + - To create a geometric object, we use `Chart.mark_*` methods (see the + [altair reference](https://altair-viz.github.io/user_guide/marks.html) + for a list of geometric objects). - Here, we use the `mark_point` function to visualize our data as a scatter plot. - The **geometric encoding**, which tells `altair` how the columns in the data frame map to properties of the visualization. - - To create an encoding, we use the `encode()` function. 
- - The `encode()` method builds a key-value mapping between encoding channels (such as x, y) to fields in the dataset, accessed by field name(column names) - - Here, we set the plot `x` axis to the `date_measured` variable, and the plot `y` axis to the `ppm` variable. + - To create an encoding, we use the `encode` method. + - The `encode` method builds a key-value mapping from encoding channels (such as `x` and `y`) to fields in the data set, accessed by field name (i.e., column name). + - Here, we set the `x` axis of the plot to the `date_measured` variable, + and on the `y` axis, we plot the `ppm` variable. We use `alt.X` and + `alt.Y` which allow you to control properties of the `x` and `y` axes. + - For the y-axis, we also provided the argument + `scale=alt.Scale(zero=False)`. By default, `altair` chooses the y-limits + based on the data and will keep `y=0` in view. That would make it + difficult to see any trends in our data since the smallest value is >300 + ppm. So by providing `scale=alt.Scale(zero=False)`, we tell `altair` to + choose a reasonable lower bound based on our data, and that lower bound + doesn't have to be zero. ```{code-cell} ipython3 :tags: ["remove-cell"] @@ -262,10 +291,9 @@ from myst_nb import glue ```{code-cell} ipython3 co2_scatter = alt.Chart(co2_df).mark_point().encode( - x = "date_measured", - y = alt.Y("ppm", scale=alt.Scale(zero=False))) - - + x=alt.X("date_measured"), + y=alt.Y("ppm", scale=alt.Scale(zero=False)) +) ``` ```{code-cell} ipython3 @@ -273,30 +301,27 @@ co2_scatter = alt.Chart(co2_df).mark_point().encode( glue('co2_scatter', co2_scatter, display=False) ``` -:::{glue:figure} co2_scatter -:figwidth: 700px +:::{glue:figure} co2_scatter +:figwidth: 700px :name: co2_scatter Scatter plot of atmospheric concentration of CO$_{2}$ over time. ::: - -> **Note:** We can change the size of the point and color of the plot by specifying `mark_point(size=10, color='black')`.
- -Certainly, the visualization in {numref}`co2_scatter` -shows a clear upward trend +The visualization in {numref}`co2_scatter` +shows a clear upward trend in the atmospheric concentration of CO$_{\text{2}}$ over time. -This plot answers the first part of our question in the affirmative, -but that appears to be the only conclusion one can make -from the scatter visualization. +This plot answers the first part of our question in the affirmative, +but that appears to be the only conclusion one can make +from the scatter visualization. One important thing to note about this data is that one of the variables we are exploring is time. -Time is a special kind of quantitative variable -because it forces additional structure on the data—the -data points have a natural order. -Specifically, each observation in the data set has a predecessor -and a successor, and the order of the observations matters; changing their order +Time is a special kind of quantitative variable +because it forces additional structure on the data—the +data points have a natural order. +Specifically, each observation in the data set has a predecessor +and a successor, and the order of the observations matters; changing their order alters their meaning. In situations like this, we typically use a line plot to visualize the data. Line plots connect the sequence of `x` and `y` coordinates @@ -305,27 +330,25 @@ of the observations with line segments, thereby emphasizing their order. ```{index} altair; mark_line ``` -We can create a line plot in `altair` using the `mark_line` function. -Let's now try to visualize the `co2_df` as a line plot -with just the default arguments: +We can create a line plot in `altair` using the `mark_line` function. 
+Let's now try to visualize the `co2_df` as a line plot +with just the default arguments: ```{code-cell} ipython3 -co2_line = alt.Chart(co2_df).mark_line(color='black').encode( - x = "date_measured", - y = alt.Y("ppm", scale=alt.Scale(zero=False))) - - +co2_line = alt.Chart(co2_df).mark_line().encode( + x=alt.X("date_measured"), + y=alt.Y("ppm", scale=alt.Scale(zero=False)) +) ``` + ```{code-cell} ipython3 :tags: ["remove-cell"] glue('co2_line', co2_line, display=False) ``` - - :::{glue:figure} co2_line -:figwidth: 700px +:figwidth: 700px :name: co2_line Line plot of atmospheric concentration of CO$_{2}$ over time. @@ -342,7 +365,7 @@ be a better choice for answering the question than the scatter plot was. The comparison between these two visualizations also illustrates a common issue with scatter plots: often, the points are shown too close together or even on top of one another, muddling information that would otherwise be clear -(*overplotting*). +(*overplotting*). ```{index} altair; alt.X, altair; alt.Y, altair; configure_axis ``` @@ -352,14 +375,14 @@ to refine things. This plot is fairly straightforward, and there is not much visual noise to remove. But there are a few things we must do to improve clarity, such as adding informative axis labels and making the font a more readable size. To add axis labels, we use the `title` argument along with `alt.X` and `alt.Y` functions. To -change the font size, we use the `configure_axis` function with the `titleFontSize` argument: +change the font size, we use the `configure_axis` function with the +`titleFontSize` argument. 
```{code-cell} ipython3 -co2_line_labels = alt.Chart(co2_df).mark_line(color='black').encode( - x = alt.X("date_measured", title = "Year"), - y = alt.Y("ppm", scale=alt.Scale(zero=False), title = "Atmospheric CO2 (ppm)")).configure_axis( - titleFontSize=12) - +co2_line_labels = alt.Chart(co2_df).mark_line().encode( + x=alt.X("date_measured", title="Year"), + y=alt.Y("ppm", scale=alt.Scale(zero=False), title="Atmospheric CO2 (ppm)") +).configure_axis(titleFontSize=12) ``` ```{code-cell} ipython3 @@ -368,13 +391,13 @@ glue('co2_line_labels', co2_line_labels, display=False) ``` :::{glue:figure} co2_line_labels -:figwidth: 700px +:figwidth: 700px :name: co2_line_labels Line plot of atmospheric concentration of CO$_{2}$ over time with clearer axes and labels. ::: -> **Note:** The `configure_` function in `altair` is complex and supports many other functionalities, which can be viewed [here](https://altair-viz.github.io/user_guide/configuration.html) +> **Note:** The `configure_*` function in `altair` supports many other functionalities for customizing visualizations, for example updating the size of the plot, changing the font color, or many other options that can be viewed [here](https://altair-viz.github.io/user_guide/configuration.html). ```{index} altair; alt.Scale ``` @@ -382,28 +405,35 @@ Line plot of atmospheric concentration of CO$_{2}$ over time with clearer axes a Finally, let's see if we can better understand the oscillation by changing the visualization slightly. Note that it is totally fine to use a small number of visualizations to answer different aspects of the question you are trying to -answer. We will accomplish this by using *scale*, +answer. We will accomplish this by using *scale*, another important feature of `altair` that easily transforms the different variables and set limits. 
We scale the horizontal axis using the `alt.Scale(domain=['1990', '1993'])` by restricting the x-axis values between 1990 and 1994, and the vertical axis with the `alt.Scale(zero=False)` function, to not start the y-axis with zero. -In particular, here, we will use the `alt.Scale()` function to zoom in -on just five years of data (say, 1990-1994). -`domain` argument takes a list of length two -to specify the upper and lower bounds to limit the axis. - -```{code-cell} ipython3 - - -co2_line_scale = alt.Chart(co2_df).mark_line(color='black', clip=True).encode( - x=alt.X("date_measured", title="Measurement Date", axis=alt.Axis(tickCount=4), scale=alt.Scale(domain=['1990', '1994'])), - y=alt.Y("ppm", scale=alt.Scale(zero=False), title="Atmospheric CO2 (ppm)") -).configure_axis( - titleFontSize=12 -) - - - - +In particular, here, we will use the `alt.Scale` function to zoom in +on just five years of data (say, 1990-1994). The +`domain` argument takes a list of length two +to specify the lower and upper bounds of the axis. +We also added the argument `clip=True` to `mark_line`. This tells `altair` +to "clip" the data outside of the domain that we set so that it doesn't +extend past the plot area. +Finally, we will use `axis=alt.Axis(tickCount=4)` to add the lines corresponding to each +year in the background to create the final visualization. This helps us to +better visualize the change with each year.
+ +```{code-cell} ipython3 +co2_line_scale = alt.Chart(co2_df).mark_line(clip=True).encode( + x=alt.X( + "date_measured", + title="Measurement Date", + axis=alt.Axis(tickCount=4), + scale=alt.Scale(domain=['1990', '1994']) + ), + y=alt.Y( + "ppm", + scale=alt.Scale(zero=False), + title="Atmospheric CO2 (ppm)" + ) +).configure_axis(titleFontSize=12) ``` ```{code-cell} ipython3 @@ -412,52 +442,55 @@ glue('co2_line_scale', co2_line_scale, display=False) ``` :::{glue:figure} co2_line_scale -:figwidth: 700px +:figwidth: 700px :name: co2_line_scale Line plot of atmospheric concentration of CO$_{2}$ from 1990 to 1994. ::: -Interesting! It seems that each year, the atmospheric CO$_{\text{2}}$ increases until it reaches its peak somewhere around April, decreases until around late September, -and finally increases again until the end of the year. In Hawaii, there are two seasons: summer from May through October, and winter from November through April. -Therefore, the oscillating pattern in CO$_{\text{2}}$ matches up fairly closely with the two seasons. +Interesting! It seems that each year, the atmospheric CO$_{\text{2}}$ increases +until it reaches its peak somewhere around April, decreases until around late +September, and finally increases again until the end of the year. In Hawaii, +there are two seasons: summer from May through October, and winter from +November through April. Therefore, the oscillating pattern in CO$_{\text{2}}$ +matches up fairly closely with the two seasons. -As you might have noticed from the code used to create the final visualization -of the `co2_df` data frame, we used `axis=alt.Axis(tickCount=4)` to add the lines in the background to better visualise and map the values on the axis to the plot. A useful analogy to constructing a data visualization is painting a picture. -We start with a blank canvas, -and the first thing we do is prepare the surface -for our painting by adding primer. 
-In our data visualization this is akin to calling `alt.Chart` +We start with a blank canvas, +and the first thing we do is prepare the surface +for our painting by adding primer. +In our data visualization this is akin to calling `alt.Chart` and specifying the data set we will be using. -Next, we sketch out the background of the painting. -In our data visualization, +Next, we sketch out the background of the painting. +In our data visualization, this would be when we map data to the axes in the `encode` function. Then we add our key visual subjects to the painting. -In our data visualization, +In our data visualization, this would be the geometric objects (e.g., `mark_point`, `mark_line`, etc.). And finally, we work on adding details and refinements to the painting. In our data visualization this would be when we fine tune axis labels, change the font, adjust the point size, and do other related things. + + ### Scatter plots: the Old Faithful eruption time data set ```{index} Old Faithful ``` -The `faithful` data set contains measurements -of the waiting time between eruptions +The `faithful` data set contains measurements +of the waiting time between eruptions and the subsequent eruption duration (in minutes) of the Old Faithful -geyser in Yellowstone National Park, Wyoming, United States. +geyser in Yellowstone National Park, Wyoming, United States. First, we will read the data and then answer the following question: ```{index} question; visualization ``` -**Question:** Is there a relationship between the waiting time before an eruption -and the duration of the eruption? +**Question:** Is there a relationship between the waiting time before an eruption +and the duration of the eruption? ```{code-cell} ipython3 faithful = pd.read_csv("data/faithful.csv") @@ -465,25 +498,25 @@ faithful ``` -Here again, we investigate the relationship between two quantitative variables -(waiting time and eruption time). 
-But if you look at the output of the data frame, +Here again, we investigate the relationship between two quantitative variables +(waiting time and eruption time). +But if you look at the output of the data frame, you'll notice that unlike time in the Mauna Loa CO$_{\text{2}}$ data set, neither of the variables here have a natural order to them. So a scatter plot is likely to be the most appropriate visualization. Let's create a scatter plot using the `altair` -package with the `waiting` variable on the horizontal axis, the `eruptions` +package with the `waiting` variable on the horizontal axis, the `eruptions` variable on the vertical axis, and the `mark_point` geometric object. +By default, `altair` draws only the outline of each point. If we would +like to fill them in, we pass the argument `filled=True` to `mark_point`. In +place of `mark_point(filled=True)`, we can also use `mark_circle`. The result is shown in {numref}`faithful_scatter`. - - ```{code-cell} ipython3 -faithful_scatter = alt.Chart(faithful).mark_point(color='black', filled=True).encode( - x = "waiting", - y = "eruptions" +faithful_scatter = alt.Chart(faithful).mark_point(filled=True).encode( + x="waiting", + y="eruptions" ) - ``` ```{code-cell} ipython3 @@ -491,8 +524,8 @@ faithful_scatter = alt.Chart(faithful).mark_point(color='black', filled=True).en glue('faithful_scatter', faithful_scatter, display=False) ``` -:::{glue:figure} faithful_scatter -:figwidth: 700px +:::{glue:figure} faithful_scatter +:figwidth: 700px :name: faithful_scatter Scatter plot of waiting time and eruption time. @@ -502,35 +535,51 @@ We can see in {numref}`faithful_scatter` that the data tend to fall into two groups: one with short waiting and eruption times, and one with long waiting and eruption times. Note that in this case, there is no overplotting: the points are generally nicely visually separated, and the pattern they form -is clear. 
Also, note that to make the points solid, we used `filled=True` as argument of the `mark_point` function. In place of `mark_point(filled=True)`, we can also use `mark_circle()`. +is clear. In order to refine the visualization, we need only to add axis -labels and make the font more readable: - - +labels and make the font more readable. ```{code-cell} ipython3 -faithful_scatter_labels = alt.Chart(faithful).mark_circle(color='black').encode( - x = alt.X("waiting", title = "Waiting Time (mins)"), - y = alt.Y("eruptions", title = "Eruption Duration (mins)") +faithful_scatter_labels = alt.Chart(faithful).mark_circle().encode( + x=alt.X("waiting", title="Waiting Time (mins)"), + y=alt.Y("eruptions", title="Eruption Duration (mins)") ) - - - ``` - ```{code-cell} ipython3 :tags: ["remove-cell"] -glue('faithful_scatter_labels', faithful_scatter_labels, display=False) +glue("faithful_scatter_labels", faithful_scatter_labels, display=False) ``` :::{glue:figure} faithful_scatter_labels -:figwidth: 700px +:figwidth: 700px :name: faithful_scatter_labels Scatter plot of waiting time and eruption time with clearer axes and labels. ::: + +We can change the size and color of the points by specifying `mark_circle(size=10, color="black")`. + +```{code-cell} ipython3 +faithful_scatter_labels_black = alt.Chart(faithful).mark_circle(size=10, color="black").encode( + x=alt.X("waiting", title="Waiting Time (mins)"), + y=alt.Y("eruptions", title="Eruption Duration (mins)") +) +``` + +```{code-cell} ipython3 +:tags: ["remove-cell"] +glue('faithful_scatter_labels_black', faithful_scatter_labels_black, display=False) +``` + +:::{glue:figure} faithful_scatter_labels_black +:figwidth: 700px +:name: faithful_scatter_labels_black + +Scatter plot of waiting time and eruption time with black points. +::: + +++ ### Axis transformation and colored scatter plots: the Canadian languages data set @@ -538,15 +587,15 @@ Scatter plot of waiting time and eruption time with clearer axes and labels.
```{index} Canadian languages ``` -Recall the `can_lang` data set {cite:p}`timbers2020canlang` from Chapters {ref}`intro`, {ref}`reading`, and {ref}`wrangling`, -which contains counts of languages from the 2016 +Recall the `can_lang` data set {cite:p}`timbers2020canlang` from the {ref}`intro`, {ref}`reading`, and {ref}`wrangling` chapters. +It contains counts of languages from the 2016 Canadian census. ```{index} question; visualization ``` **Question:** Is there a relationship between -the percentage of people who speak a language as their mother tongue and +the percentage of people who speak a language as their mother tongue and the percentage for whom that is the primary language spoken at home? And is there a pattern in the strength of this relationship in the higher-level language categories (Official languages, Aboriginal languages, or @@ -555,7 +604,9 @@ non-official and non-Aboriginal languages)? To get started, we will read and inspect the data: ```{code-cell} ipython3 -can_lang = pd.read_csv("data/can_lang.csv") +:tags: ["output_scroll"] +can_lang = pd.read_csv("data/can_lang.csv") +can_lang ``` ```{code-cell} ipython3 @@ -570,11 +621,10 @@ We will begin with a scatter plot of the `mother_tongue` and `most_at_home` colu The resulting plot is shown in {numref}`can_lang_plot` ```{code-cell} ipython3 - -can_lang_plot = alt.Chart(can_lang).mark_circle(color='black').encode( - x = "most_at_home", - y = "mother_tongue") - +can_lang_plot = alt.Chart(can_lang).mark_circle().encode( + x="most_at_home", + y="mother_tongue" +) ``` @@ -584,7 +634,7 @@ glue('can_lang_plot', can_lang_plot, display=False) ``` :::{glue:figure} can_lang_plot -:figwidth: 700px +:figwidth: 700px :name: can_lang_plot Scatter plot of number of Canadians reporting a language as their mother tongue vs the primary language at home @@ -593,21 +643,27 @@ Scatter plot of number of Canadians reporting a language as their mother tongue ```{index} escape character ``` -To make an initial improvement 
in the interpretability -of {numref}`can_lang_plot`, we should +To make an initial improvement in the interpretability +of {numref}`can_lang_plot`, we should replace the default axis names with more informative labels. We can add a line break in the axis names so that some of the words are printed on a new line. This will make the axes labels on the plots more readable. To do this, we pass the title as a list. Each element of the list will be on a new line. -We should also increase the font size to further +We should also increase the font size to further improve readability. ```{code-cell} ipython3 -can_lang_plot_labels = alt.Chart(can_lang).mark_circle(color='black').encode( - x = alt.X("most_at_home",title = ["Language spoken most at home", "(number of Canadian residents)"]), - y = alt.Y("mother_tongue", scale=alt.Scale(zero=False), title = ["Mother tongue", "(number of Canadian residents)"])).configure_axis( - titleFontSize=12) - +can_lang_plot_labels = alt.Chart(can_lang).mark_circle().encode( + x=alt.X( + "most_at_home", + title=["Language spoken most at home", "(number of Canadian residents)"] + ), + y=alt.Y( + "mother_tongue", + scale=alt.Scale(zero=False), + title=["Mother tongue", "(number of Canadian residents)"] + ) +).configure_axis(titleFontSize=12) ``` ```{code-cell} ipython3 @@ -616,7 +672,7 @@ glue('can_lang_plot_labels', can_lang_plot_labels, display=False) ``` :::{glue:figure} can_lang_plot_labels -:figwidth: 700px +:figwidth: 700px :name: can_lang_plot_labels Scatter plot of number of Canadians reporting a language as their mother tongue vs the primary language at home with x and y labels. 
@@ -628,11 +684,11 @@ Scatter plot of number of Canadians reporting a language as their mother tongue ```{code-cell} ipython3 :tags: ["remove-cell"] import numpy as np -numlang_speakers_max = max(can_lang['mother_tongue']) +numlang_speakers_max=int(max(can_lang['mother_tongue'])) print(numlang_speakers_max) -numlang_speakers_min = min(can_lang['mother_tongue']) +numlang_speakers_min = int(min(can_lang['mother_tongue'])) print(numlang_speakers_min) -log_result = np.floor(np.log(numlang_speakers_max/numlang_speakers_min)) +log_result = int(np.floor(np.log10(numlang_speakers_max/numlang_speakers_min))) print(log_result) glue("numlang_speakers_max", numlang_speakers_max) glue("numlang_speakers_min", numlang_speakers_min) @@ -644,11 +700,11 @@ much more readable and interpretable now. However, the scatter points themselves some work; most of the 214 data points are bunched up in the lower left-hand side of the visualization. The data is clumped because many more people in Canada speak English or French (the two points in -the upper right corner) than other languages. -In particular, the most common mother tongue language -has {glue:}`numlang_speakers_max` speakers, +the upper right corner) than other languages. +In particular, the most common mother tongue language +has {glue:}`numlang_speakers_max` speakers, while the least common has only {glue:}`numlang_speakers_min`. -That's a {glue:}`log_result` -decimal-place difference +That's a six-decimal-place difference in the magnitude of these two numbers! 
We can confirm that the two points in the upper right-hand corner correspond to Canada's two official languages by filtering the data: @@ -657,14 +713,18 @@ to Canada's two official languages by filtering the data: ``` ```{code-cell} ipython3 -can_lang.loc[(can_lang['language']=='English') | (can_lang['language']=='French')] +:tags: ["output_scroll"] +can_lang.loc[ + (can_lang['language']=='English') | + (can_lang['language']=='French') +] ``` ```{index} logarithmic scale, altair; logarithmic scaling ``` Recall that our question about this data pertains to *all* languages; -so to properly answer our question, +so to properly answer our question, we will need to adjust the scale of the axes so that we can clearly see all of the scatter points. In particular, we will improve the plot by adjusting the horizontal @@ -672,23 +732,32 @@ and vertical axes so that they are on a **logarithmic** (or **log**) scale. Log scaling is useful when your data take both *very large* and *very small* values, because it helps space out small values and squishes larger values together. For example, $\log_{10}(1) = 0$, $\log_{10}(10) = 1$, $\log_{10}(100) = 2$, and $\log_{10}(1000) = 3$; -on the logarithmic scale, +on the logarithmic scale, the values 1, 10, 100, and 1000 are all the same distance apart! -So we see that applying this function is moving big values closer together +So we see that applying this function is moving big values closer together and moving small values farther apart. -Note that if your data can take the value 0, logarithmic scaling may not +Note that if your data can take the value 0, logarithmic scaling may not be appropriate (since `log10(0) = -inf` in Python). There are other ways to transform -the data in such a case, but these are beyond the scope of the book. +the data in such a case, but these are beyond the scope of the book. We can accomplish logarithmic scaling in the `altair` visualization using the argument `type="log"` in the scale functions. 
```{code-cell} ipython3 -can_lang_plot_log = alt.Chart(can_lang).mark_circle(color='black').encode( - x = alt.X("most_at_home",title = ["Language spoken most at home", "(number of Canadian residents)"], scale=alt.Scale( type="log"), axis=alt.Axis(tickCount=7)), - y = alt.Y("mother_tongue", title = ["Mother tongue", "(number of Canadian residents)"], scale=alt.Scale(type="log"), axis=alt.Axis(tickCount=7))).configure_axis( - titleFontSize=12) - +can_lang_plot_log = alt.Chart(can_lang).mark_circle().encode( + x=alt.X( + "most_at_home", + title=["Language spoken most at home", "(number of Canadian residents)"], + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ), + y=alt.Y( + "mother_tongue", + title=["Mother tongue", "(number of Canadian residents)"], + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ) +).configure_axis(titleFontSize=12) ``` ```{code-cell} ipython3 @@ -697,7 +766,7 @@ glue('can_lang_plot_log', can_lang_plot_log, display=False) ``` :::{glue:figure} can_lang_plot_log -:figwidth: 700px +:figwidth: 700px :name: can_lang_plot_log Scatter plot of number of Canadians reporting a language as their mother tongue vs the primary language at home with log adjusted x and y axes. @@ -716,16 +785,16 @@ glue("result", result) ``` -Similar to some of the examples in Chapter {ref}`wrangling`, -we can convert the counts to percentages to give them context +Similar to some of the examples in the chapter on {ref}`wrangling`, +we can convert the counts to percentages to give them context and make them easier to understand. -We can do this by dividing the number of people reporting a given language -as their mother tongue or primary language at home -by the number of people who live in Canada and multiplying by 100\%. 
-For example, -the percentage of people who reported that their mother tongue was English -in the 2016 Canadian census -was {glue:}`english_mother_tongue` / {glue:}`census_popn` $\times$ +We can do this by dividing the number of people reporting a given language +as their mother tongue or primary language at home +by the number of people who live in Canada and multiplying by 100\%. +For example, +the percentage of people who reported that their mother tongue was English +in the 2016 Canadian census +was {glue:}`english_mother_tongue` / {glue:}`census_popn` $\times$ `100` \% = {glue:}`result`\% Below we use `assign` to calculate the percentage of people reporting a given @@ -738,14 +807,16 @@ you can clearly see the mutated output from the table. ``` ```{code-cell} ipython3 -can_lang = can_lang.assign(mother_tongue_percent = (can_lang['mother_tongue'] / 35151728) * 100, - most_at_home_percent = (can_lang['most_at_home'] / 35151728) * 100) +can_lang = can_lang.assign( + mother_tongue_percent=(can_lang['mother_tongue']/35151728) * 100, + most_at_home_percent=(can_lang['most_at_home']/35151728) * 100 +) can_lang[['mother_tongue_percent', 'most_at_home_percent']] ``` Finally, we will edit the visualization to use the percentages we just computed -(and change our axis labels to reflect this change in +(and change our axis labels to reflect this change in units). {numref}`can_lang_plot_percent` displays the final result. @@ -753,11 +824,20 @@ the final result. 
```{code-cell} ipython3 -can_lang_plot_percent = alt.Chart(can_lang).mark_circle(color='black').encode( - x = alt.X("most_at_home_percent",title = ["Language spoken most at home", "(number of Canadian residents)"], scale=alt.Scale(type="log"), axis=alt.Axis(tickCount=7)), - y = alt.Y("mother_tongue_percent", title = ["Mother tongue", "(number of Canadian residents)"], scale=alt.Scale(type="log"), axis=alt.Axis(tickCount=7))).configure_axis( - titleFontSize=12) - +can_lang_plot_percent = alt.Chart(can_lang).mark_circle().encode( + x=alt.X( + "most_at_home_percent", + title=["Language spoken most at home", "(percentage of Canadian residents)"], + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ), + y=alt.Y( + "mother_tongue_percent", + title=["Mother tongue", "(percentage of Canadian residents)"], + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ) +).configure_axis(titleFontSize=12) ``` ```{code-cell} ipython3 @@ -766,7 +846,7 @@ glue('can_lang_plot_percent', can_lang_plot_percent, display=False) ``` :::{glue:figure} can_lang_plot_percent -:figwidth: 700px +:figwidth: 700px :name: can_lang_plot_percent Scatter plot of percentage of Canadians reporting a language as their mother tongue vs the primary language at home. @@ -774,46 +854,46 @@ Scatter plot of percentage of Canadians reporting a language as their mother ton {numref}`can_lang_plot_percent` is the appropriate visualization to use to answer the first question in this section, i.e., -whether there is a relationship between the percentage of people who speak +whether there is a relationship between the percentage of people who speak a language as their mother tongue and the percentage for whom that is the primary language spoken at home. 
To fully answer the question, we need to use {numref}`can_lang_plot_percent` -to assess a few key characteristics of the data: +to assess a few key characteristics of the data: ```{index} relationship; positive negative none ``` -- **Direction:** if the y variable tends to increase when the x variable increases, then y has a **positive** relationship with x. If - y tends to decrease when x increases, then y has a **negative** relationship with x. If y does not meaningfully increase or decrease - as x increases, then y has **little or no** relationship with x. +- **Direction:** if the y variable tends to increase when the x variable increases, then y has a **positive** relationship with x. If + y tends to decrease when x increases, then y has a **negative** relationship with x. If y does not meaningfully increase or decrease + as x increases, then y has **little or no** relationship with x. ```{index} relationship; strong weak ``` - **Strength:** if the y variable *reliably* increases, decreases, or stays flat as x increases, - then the relationship is **strong**. Otherwise, the relationship is **weak**. Intuitively, + then the relationship is **strong**. Otherwise, the relationship is **weak**. Intuitively, the relationship is strong when the scatter points are close together and look more like a "line" or "curve" than a "cloud." ```{index} relationship; linear nonlinear ``` -- **Shape:** if you can draw a straight line roughly through the data points, the relationship is **linear**. Otherwise, it is **nonlinear**. +- **Shape:** if you can draw a straight line roughly through the data points, the relationship is **linear**. Otherwise, it is **nonlinear**. -In {numref}`can_lang_plot_percent`, we see that -as the percentage of people who have a language as their mother tongue increases, -so does the percentage of people who speak that language at home. 
+In {numref}`can_lang_plot_percent`, we see that +as the percentage of people who have a language as their mother tongue increases, +so does the percentage of people who speak that language at home. Therefore, there is a **positive** relationship between these two variables. Furthermore, because the points in {numref}`can_lang_plot_percent` are fairly close together, and the points look more like a "line" than a "cloud", -we can say that this is a **strong** relationship. -And finally, because drawing a straight line through these points in +we can say that this is a **strong** relationship. +And finally, because drawing a straight line through these points in {numref}`can_lang_plot_percent` would fit the pattern we observe quite well, we say that the relationship is **linear**. Onto the second part of our exploratory data analysis question! -Recall that we are interested in knowing whether the strength -of the relationship we uncovered +Recall that we are interested in knowing whether the strength +of the relationship we uncovered in {numref}`can_lang_plot_percent` depends on the higher-level language category (Official languages, Aboriginal languages, and non-official, non-Aboriginal languages). @@ -821,24 +901,34 @@ One common way to explore this is to color the data points on the scatter plot we have already created by group. For example, given that we have the higher-level language category for each language recorded in the 2016 Canadian census, we can color the points in -our previous +our previous scatter plot to represent each language's higher-level language category. Here we want to distinguish the values according to the `category` group with which they belong. We can add the argument `color` to the `encode` function, specifying that the `category` column should color the points. Adding this argument will color the points according to their group and add a legend at the side of the -plot. +plot. 
```{code-cell} ipython3 -can_lang_plot_category = alt.Chart(can_lang).mark_circle().encode( - x = alt.X("most_at_home_percent", title = ["Language spoken most at home", "(number of Canadian residents)"], scale=alt.Scale(type="log"), axis=alt.Axis(tickCount=7)), - y = alt.Y("mother_tongue_percent", title = ["Mother tongue", "(number of Canadian residents)"], scale=alt.Scale(type="log"), axis=alt.Axis(tickCount=7)), - color = "category").configure_axis( - titleFontSize=12) +can_lang_plot_category=alt.Chart(can_lang).mark_circle().encode( + x=alt.X( + "most_at_home_percent", + title=["Language spoken most at home", "(percentage of Canadian residents)"], + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ), + y=alt.Y( + "mother_tongue_percent", + title=["Mother tongue", "(percentage of Canadian residents)"], + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ), + color="category" +).configure_axis(titleFontSize=12) ``` @@ -848,37 +938,50 @@ glue('can_lang_plot_category', can_lang_plot_category, display=False) ``` :::{glue:figure} can_lang_plot_category -:figwidth: 700px +:figwidth: 700px :name: can_lang_plot_category Scatter plot of percentage of Canadians reporting a language as their mother tongue vs the primary language at home colored by language category. ::: -The legend in {numref}`can_lang_plot_category` -takes up valuable plot area. -We can improve this by moving the legend title using the `alt.Legend` function +Another thing we can adjust is the location of the legend. +This is a matter of preference and not critical for the visualization. +We move the legend title using the `alt.Legend` function with the arguments `legendX`, `legendY` and `direction` -arguments of the `theme` function. -Here we set the `direction` to `"vertical"` so that the legend items remain -vertically stacked on top of each other. 
The default `direction` is horizontal, which won't work -not work well for this particular visualization -because the legend labels are quite long -and would run off the page if displayed this way. +arguments of the `theme` function. +Here we set the `direction` to `"vertical"` so that the legend items remain +vertically stacked on top of each other. The default `direction` is horizontal, which works well for many cases, but +for this particular visualization +because the legend labels are quite long, it is a bit cleaner if we move the +legend above the plot instead. ```{code-cell} ipython3 can_lang_plot_legend = alt.Chart(can_lang).mark_circle().encode( - x = alt.X("most_at_home_percent",title = ["Language spoken most at home", "(number of Canadian residents)"], scale=alt.Scale(type="log"),axis=alt.Axis(tickCount=7)), - y = alt.Y("mother_tongue_percent", title = ["Mother tongue", "(number of Canadian residents)"], scale=alt.Scale(type="log"), axis=alt.Axis(tickCount=7)), - color = alt.Color("category", legend=alt.Legend( - orient='none', - legendX=0, legendY=-90, - direction='vertical'))).configure_axis( - titleFontSize=12) - - + x=alt.X( + "most_at_home_percent", + title=["Language spoken most at home", "(percentage of Canadian residents)"], + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ), + y=alt.Y( + "mother_tongue_percent", + title=["Mother tongue", "(percentage of Canadian residents)"], + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ), + color=alt.Color( + "category", + legend=alt.Legend( + orient='none', + legendX=0, + legendY=-90, + direction='vertical' + ) + ) +).configure_axis(titleFontSize=12) ``` ```{code-cell} ipython3 @@ -887,49 +990,59 @@ glue('can_lang_plot_legend', can_lang_plot_legend, display=False) ``` :::{glue:figure} can_lang_plot_legend -:figwidth: 700px +:figwidth: 700px :name: can_lang_plot_legend Scatter plot of percentage of Canadians reporting a language as their mother tongue vs the primary language at 
home colored by language category with the legend edited. ::: In {numref}`can_lang_plot_legend`, the points are colored with -the default `altair` color palette. But what if you want to use different -colors? In Altair, there are many themes available, which can be viewed [here](https://vega.github.io/vega/docs/schemes/) - -To change the color scheme, +the default `altair` color palette. This is an appropriate choice for most situations. In Altair, there are many themes available, which can be viewed [in the documentation](https://altair-viz.github.io/user_guide/customization.html#customizing-colors). To change the color scheme, we add the `scheme` argument in the `scale` of the `color` argument in `altair` layer indicating the palette we want to use. ```{index} color palette; color blindness simulator ``` -You can use -this [color blindness simulator](https://www.color-blindness.com/coblis-color-blindness-simulator/) to check -if your visualizations -are color-blind friendly. - Below we pick the `"dark2"` theme, with the result shown in {numref}`can_lang_plot_theme` We also set the `shape` aesthetic mapping to the `category` variable as well; -this makes the scatter point shapes different for each category. This kind of +this makes the scatter point shapes different for each category. This kind of visual redundancy—i.e., conveying the same information with both scatter point color and shape—can further improve the clarity and accessibility of your visualization. - -> Note: We cannot use different shapes with `mark_circle`, it can only be used with `mark_point` +You can use +this [color blindness simulator](https://www.color-blindness.com/coblis-color-blindness-simulator/) to check +if your visualizations are color-blind friendly. +The default color palettes in `altair` are color-blind friendly (one more reason to stick with the defaults!). +Note that we are switching back to the use of `mark_point` so that +we can specify the `shape` attribute. 
This cannot be done with `mark_circle`. ```{code-cell} ipython3 can_lang_plot_theme = alt.Chart(can_lang).mark_point(filled=True).encode( - x = alt.X("most_at_home_percent",title = ["Language spoken most at home", "(number of Canadian residents)"], scale=alt.Scale(type="log"), axis=alt.Axis(tickCount=7)), - y = alt.Y("mother_tongue_percent", title = "Mother tongue(percentage of Canadian residents)", scale=alt.Scale(type="log"), axis=alt.Axis(tickCount=7)), - color = alt.Color("category", legend=alt.Legend( - orient='none', - legendX=0, legendY=-90, - direction='vertical'), - scale=alt.Scale(scheme='dark2')), - shape = "category").configure_axis( - titleFontSize=12) - + x=alt.X( + "most_at_home_percent", + title=["Language spoken most at home", "(percentage of Canadian residents)"], + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ), + y=alt.Y( + "mother_tongue_percent", + title="Mother tongue(percentage of Canadian residents)", + scale=alt.Scale(type="log"), + axis=alt.Axis(tickCount=7) + ), + color=alt.Color( + "category", + legend=alt.Legend( + orient='none', + legendX=0, + legendY=-90, + direction='vertical' + ), + scale=alt.Scale(scheme='dark2') + ), + shape="category" +).configure_axis(titleFontSize=12) ``` ```{code-cell} ipython3 @@ -938,26 +1051,26 @@ glue('can_lang_plot_theme', can_lang_plot_theme, display=False) ``` :::{glue:figure} can_lang_plot_theme -:figwidth: 700px +:figwidth: 700px :name: can_lang_plot_theme Scatter plot of percentage of Canadians reporting a language as their mother tongue vs the primary language at home colored by language category with color-blind friendly colors. ::: -From the visualization in {numref}`can_lang_plot_theme`, -we can now clearly see that the vast majority of Canadians reported one of the official languages -as their mother tongue and as the language they speak most often at home. -What do we see when considering the second part of our exploratory question? 
+From the visualization in {numref}`can_lang_plot_theme`, +we can now clearly see that the vast majority of Canadians reported one of the official languages +as their mother tongue and as the language they speak most often at home. +What do we see when considering the second part of our exploratory question? Do we see a difference in the relationship between languages spoken as a mother tongue and as a primary language -at home across the higher-level language categories? +at home across the higher-level language categories? Based on {numref}`can_lang_plot_theme`, there does not appear to be much of a difference. -For each higher-level language category, -there appears to be a strong, positive, and linear relationship between -the percentage of people who speak a language as their mother tongue -and the percentage who speak it as their primary language at home. -The relationship looks similar regardless of the category. +For each higher-level language category, +there appears to be a strong, positive, and linear relationship between +the percentage of people who speak a language as their mother tongue +and the percentage who speak it as their primary language at home. +The relationship looks similar regardless of the category. Does this mean that this relationship is positive for all languages in the world? And further, can we use this data visualization on its own to predict how many people @@ -966,15 +1079,15 @@ it as their primary language at home? The answer to both these questions is "no!" However, with exploratory data analysis, we can create new hypotheses, ideas, and questions (like the ones at the beginning of this paragraph). Answering those questions often involves doing more complex analyses, and sometimes -even gathering additional data. We will see more of such complex analyses later on in -this book. +even gathering additional data. We will see more of such complex analyses later on in +this book. 
### Bar plots: the island landmass data set ```{index} Island landmasses ``` -The `islands.csv` data set contains a list of Earth's landmasses as well as their area (in thousands of square miles) {cite:p}`islandsdata`. +The `islands.csv` data set contains a list of Earth's landmasses as well as their area (in thousands of square miles) {cite:p}`islandsdata`. ```{index} question; visualization ``` @@ -984,14 +1097,15 @@ The `islands.csv` data set contains a list of Earth's landmasses as well as thei To get started, we will read and inspect the data: ```{code-cell} ipython3 +:tags: ["output_scroll"] islands_df = pd.read_csv("data/islands.csv") islands_df ``` -Here, we have a data frame of Earth's landmasses, -and are trying to compare their sizes. -The right type of visualization to answer this question is a bar plot. -In a bar plot, the height of the bar represents the value of a summary statistic +Here, we have a data frame of Earth's landmasses, +and are trying to compare their sizes. +The right type of visualization to answer this question is a bar plot. +In a bar plot, the height of the bar represents the value of a summary statistic (usually a size, count, proportion or percentage). They are particularly useful for comparing summary statistics between different groups of a categorical variable. @@ -1000,13 +1114,13 @@ groups of a categorical variable. ``` We specify that we would like to use a bar plot -via the `mark_bar` function in `altair`. -The result is shown in {numref}`islands_bar` +via the `mark_bar` function in `altair`. +The result is shown in {numref}`islands_bar`. ```{code-cell} ipython3 islands_bar = alt.Chart(islands_df).mark_bar().encode( - x = "landmass", y = "size") - + x="landmass", y="size" +) ``` ```{code-cell} ipython3 @@ -1015,7 +1129,7 @@ glue('islands_bar', islands_bar, display=False) ``` :::{glue:figure} islands_bar -:figwidth: 700px +:figwidth: 400px :name: islands_bar Bar plot of all Earth's landmasses' size with squished labels. 
@@ -1024,59 +1138,56 @@ Bar plot of all Earth's landmasses' size with squished labels. Alright, not bad! The plot in {numref}`islands_bar` is definitely the right kind of visualization, as we can clearly see and compare sizes of landmasses. The major issues are that the smaller landmasses' sizes -are hard to distinguish, and the names of the landmasses are tilted by default to fit in the labels. But remember that the +are hard to distinguish, and the plot is so wide that we can't compare them all! But remember that the question we asked was only about the largest landmasses; let's make the plot a little bit clearer by keeping only the largest 12 landmasses. We do this using -the `sort_values` function followed by the `iloc` property. Then to help us make sure the labels have enough +the `nlargest` function; the first argument is the number of rows we want and +the second is the name of the column we want to use for comparing who is +largest. Then to help us make sure the labels have enough space, we'll use horizontal bars instead of vertical ones. We do this by -swapping the `x` and `y` variables: +swapping the `x` and `y` variables. -```{index} pandas.DataFrame; sort_values, pandas.DataFrame; iloc[] +```{index} pandas.DataFrame; nlargest ``` ```{code-cell} ipython3 -islands_top12 = islands_df.sort_values(by = "size", ascending=False).iloc[:12] +islands_top12 = islands_df.nlargest(12, "size") -islands_bar_sorted = alt.Chart(islands_top12).mark_bar().encode( - x = "size", y = "landmass") +islands_bar_top = alt.Chart(islands_top12).mark_bar().encode( + x="size", y="landmass" +) ``` ```{code-cell} ipython3 :tags: ["remove-cell"] -glue('islands_bar_sorted', islands_bar_sorted, display=True) +glue('islands_bar_top', islands_bar_top, display=True) ``` -:::{glue:figure} islands_bar_sorted -:figwidth: 700px -:name: islands_bar_sorted +:::{glue:figure} islands_bar_top +:figwidth: 700px +:name: islands_bar_top Bar plot of size for Earth's largest 12 landmasses. 
::: - - -The plot in {numref}`islands_bar_sorted` is definitely clearer now, -and allows us to answer our question -("are the top 7 largest landmasses continents?") in the affirmative. -But the question could be made clearer from the plot +The plot in {numref}`islands_bar_top` is definitely clearer now, +and allows us to answer our question +("Are the top 7 largest landmasses continents?") in the affirmative. +But the question could be made clearer from the plot by organizing the bars not by alphabetical order -but by size, and to color them based on whether they are a continent. -The data for this is stored in the `landmass_type` column. -To use this to color the bars, +but by size, and to color them based on whether they are a continent. +The data for this is stored in the `landmass_type` column. +To use this to color the bars, we use the `color` argument to color the bars according to the `landmass_type` -To organize the landmasses by their `size` variable, +To organize the landmasses by their `size` variable, we will use the `altair` `sort` function in encoding for `y` axis to organize the landmasses by their `size` variable, which is encoded on the x-axis. To sort the landmasses by their size(denoted on `x` axis), we use `sort='x'`. This plots the values on `y` axis -in the ascending order of `x` axis values. - +in the ascending order of `x` axis values. We do this here so that the largest bar will be closest to the axis line, -which is more visually appealing. - -> **Note:** If we want to sort the values on `y-axis` in descending order of `x-axis`, -> we need to specify `sort='-x'`. +which is more visually appealing. If instead, we want to sort the values on `y-axis` in descending order of `x-axis`, we need to specify `sort='-x'`. ```{index} altair; sort ``` @@ -1085,16 +1196,16 @@ To label the x and y axes, we will use the `alt.X` and `alt.Y` function The default label is the name of the column being mapped to `color`. 
Here that would be `landmass_type`; however `landmass_type` is not proper English (and so is less readable). -Thus we use the `title` argument inside `alt.Color` to change that to "Type" -Finally, we again use the `configure_axis` function +Thus we use the `title` argument inside `alt.Color` to change that to `"Type"`. +Finally, we again use the `configure_axis` function to change the font size. ```{code-cell} ipython3 -islands_plot_sorted = alt.Chart(islands_top12).mark_bar(color='black').encode( - x = alt.X("size",title = "Size (1000 square mi)"), - y = alt.Y("landmass", title = "Landmass", sort='x'), - color = alt.Color("landmass_type", title = "Type")).configure_axis( - titleFontSize=12) +islands_plot_sorted = alt.Chart(islands_top12).mark_bar().encode( + x=alt.X("size",title="Size (1000 square mi)"), + y=alt.Y("landmass", title="Landmass", sort="x"), + color=alt.Color("landmass_type", title="Type") +).configure_axis(titleFontSize=12) ``` ```{code-cell} ipython3 @@ -1103,7 +1214,7 @@ glue('islands_plot_sorted', islands_plot_sorted, display=True) ``` :::{glue:figure} islands_plot_sorted -:figwidth: 700px +:figwidth: 700px :name: islands_plot_sorted Bar plot of size for Earth's largest 12 landmasses colored by whether its a continent with clearer axes and labels. @@ -1114,166 +1225,202 @@ The plot in {numref}`islands_plot_sorted` is now a very effective visualization for answering our original questions. Landmasses are organized by their size, and continents are colored differently than other landmasses, making it quite clear that continents are the largest seven landmasses. +We can make one more finishing touch in {numref}`islands_plot_titled`: we will +add a title to the chart by specifying `title` argument in the `alt.Chart` function. +Note that plot titles are not always required; usually plots appear as part +of other media (e.g., in a slide presentation, on a poster, in a paper) where +the title may be redundant with the surrounding context. 
+ +```{code-cell} ipython3 +islands_plot_titled = alt.Chart(islands_top12, title="Largest 12 landmasses on Earth").mark_bar().encode( + x=alt.X("size",title="Size (1000 square mi)"), + y=alt.Y("landmass", title="Landmass", sort="x"), + color=alt.Color("landmass_type", title="Type") +).configure_axis(titleFontSize=12) +``` + +```{code-cell} ipython3 +:tags: ["remove-cell"] +glue('islands_plot_titled', islands_plot_titled, display=True) +``` + +:::{glue:figure} islands_plot_titled +:figwidth: 700px +:name: islands_plot_titled + +Bar plot of size for Earth's largest 12 landmasses with a title. +::: ### Histograms: the Michelson speed of light data set ```{index} Michelson speed of light ``` -The `morley` data set -contains measurements of the speed of light +The `morley` data set +contains measurements of the speed of light collected in experiments performed in 1879. -Five experiments were performed, -and in each experiment, 20 runs were performed—meaning that -20 measurements of the speed of light were collected +Five experiments were performed, +and in each experiment, 20 runs were performed—meaning that +20 measurements of the speed of light were collected in each experiment {cite:p}`lightdata`. - -Because the speed of light is a very large number +Because the speed of light is a very large number (the true value is 299,792.458 km/sec), the data is coded to be the measured speed of light minus 299,000. This coding allows us to focus on the variations in the measurements, which are generally much smaller than 299,000. If we used the full large speed measurements, the variations in the measurements would not be noticeable, making it difficult to study the differences between the experiments. -Note that we convert the `morley` data to a tibble to take advantage of the nicer print output -these specialized data frames provide. 
```{index} question; visualization ``` -**Question:** Given what we know now about the speed of +**Question:** Given what we know now about the speed of light (299,792.458 kilometres per second), how accurate were each of the experiments? First, we read in the data. ```{code-cell} ipython3 morley_df = pd.read_csv("data/morley.csv") +morley_df ``` ```{index} distribution, altair; histogram ``` -In this experimental data, -Michelson was trying to measure just a single quantitative number -(the speed of light). -The data set contains many measurements of this single quantity. -To tell how accurate the experiments were, -we need to visualize the distribution of the measurements -(i.e., all their possible values and how often each occurs). -We can do this using a *histogram*. -A histogram -helps us visualize how a particular variable is distributed in a data set -by separating the data into bins, -and then using vertical bars to show how many data points fell in each bin. +In this experimental data, +Michelson was trying to measure just a single quantitative number +(the speed of light). +The data set contains many measurements of this single quantity. +To tell how accurate the experiments were, +we need to visualize the distribution of the measurements +(i.e., all their possible values and how often each occurs). +We can do this using a *histogram*. +A histogram +helps us visualize how a particular variable is distributed in a data set +by separating the data into bins, +and then using vertical bars to show how many data points fell in each bin. To create a histogram in `altair` we will use the `mark_bar` geometric -object, setting the `x` axis to the `Speed` measurement variable and `y` axis to `count()`. As usual, +object, setting the `x` axis to the `Speed` measurement variable and `y` axis to `"count()"`. 
+There is no `"count()"` column-name in `morley_df`; we use `"count()"` to tell `altair` +that we want to count the number of values in the `Speed` column in each bin. +As usual, let's use the default arguments just to see how things look. ```{code-cell} ipython3 morley_hist = alt.Chart(morley_df).mark_bar().encode( - x = alt.X("Speed"), - y='count()') + x=alt.X("Speed"), + y=alt.Y("count()") +) ``` ```{code-cell} ipython3 :tags: ["remove-cell"] -glue('morley_hist', morley_hist, display=False) +glue("morley_hist", morley_hist, display=False) ``` :::{glue:figure} morley_hist -:figwidth: 700px +:figwidth: 700px :name: morley_hist Histogram of Michelson's speed of light data. ::: -```{index} altair; mark_rule +#### Adding layers to an `altair` plot object + +```{index} altair; +; mark_rule ``` -{numref}`morley_hist` is a great start. -However, -we cannot tell how accurate the measurements are using this visualization +{numref}`morley_hist` is a great start. +However, +we cannot tell how accurate the measurements are using this visualization unless we can see the true value. -In order to visualize the true speed of light, +In order to visualize the true speed of light, we will add a vertical line with the `mark_rule` function. -To draw a vertical line with `mark_rule`, -we need to specify where on the x-axis the line should be drawn. -We can do this by creating a dataframe with just one column with value `792.458`, which is the true value of light speed -minus 299,000 and encoding it in the `x` axis; this ensures it is coded the same way as the -measurements in the `morley` data frame. -We would also like to fine tune this vertical line, -styling it so that it is dashed and 1 point in thickness. -A point is a measurement unit commonly used with fonts, -and 1 point is about 0.353 mm. -We do this by setting `strokeDash=[3,3]` and `size = 1`, respectively. 
- -Similarly, a horizontal line can be plotted using the `y` axis encoding and the dataframe with one value, which would act as the be the y-intercept - -Note that -*vertical lines* are used to denote quantities on the *horizontal axis*, -while *horizontal lines* are used to denote quantities on the *vertical axis*. - -To add the dashed line on top of the histogram, we will use the `+` operator. This concept is also known as layering in altair.(This is covered in the later sections of the chapter). Here, we add the `mark_rule` chart on the `morley_hist` chart of the form `mark_bar` +To draw a vertical line with `mark_rule`, +we need to specify where on the x-axis the line should be drawn. +We can do this by providing `x=alt.datum(792.458)`. The value `792.458` +is the true value of light speed +minus 299,000. Using `alt.datum` tells altair that we have a single datum +(number) that we would like plotted. +We would also like to fine tune this vertical line, +styling it so that it is dashed; +we do this by setting `strokeDash=[3]`. Note that you could also +change the thickness of the line by providing `size=2` if you wanted to. +Similarly, a horizontal line can be plotted using the `y` axis encoding and +a single value passed to `alt.datum`, which would act as the y-intercept. +Note that +*vertical lines* are used to denote quantities on the *horizontal axis*, +while *horizontal lines* are used to denote quantities on the *vertical axis*. + +To add the dashed line on top of the histogram, we +**add** the `mark_rule` chart to the `morley_hist` +using the `+` operator. +Adding features to a plot using the `+` operator is known as *layering* in `altair`. +This is a very powerful feature of `altair`; you +can continue to iterate on a single plot object, adding and refining +one layer at a time. If you stored your plot as a named object +using the assignment symbol (`=`), you can add to it using the `+` operator.
+Below we add a vertical line created using `mark_rule` +to the last plot we created, `morley_hist`, using the `+` operator. ```{code-cell} ipython3 -v_line = alt.Chart(pd.DataFrame({'x': [792.458]})).mark_rule( - strokeDash=[3,3], size=1).encode( - x='x') - - -final_plot = morley_hist + v_line +v_line = alt.Chart().mark_rule(strokeDash=[3]).encode( + x=alt.datum(792.458) +) +morley_hist_line = morley_hist + v_line ``` ```{code-cell} ipython3 :tags: ["remove-cell"] -glue('final_plot_viz', final_plot, display=False) +glue("morley_hist_line", morley_hist_line, display=False) ``` -:::{glue:figure} final_plot_viz -:figwidth: 700px -:name: final_plot_viz +:::{glue:figure} morley_hist_line +:figwidth: 700px +:name: morley_hist_line Histogram of Michelson's speed of light data with vertical line indicating true speed of light. ::: -In {numref}`final_plot_viz`, -we still cannot tell which experiments (denoted in the `Expt` column) -led to which measurements; -perhaps some experiments were more accurate than others. -To fully answer our question, -we need to separate the measurements from each other visually. -We can try to do this using a *colored* histogram, -where counts from different experiments are stacked on top of each other -in different colors. -We can create a histogram colored by the `Expt` variable -by adding it to the `color` argument. -We make sure the different colors can be seen -(despite them all sitting on top of each other) -by setting the `opacity` argument in `mark_bar` to `0.5` -to make the bars slightly translucent. +In {numref}`morley_hist_line`, +we still cannot tell which experiments (denoted by the `Expt` column) +led to which measurements; +perhaps some experiments were more accurate than others. +To fully answer our question, +we need to separate the measurements from each other visually. +We can try to do this using a *colored* histogram, +where counts from different experiments are stacked on top of each other +in different colors. 
+We can create a histogram colored by the `Expt` variable +by adding it to the `color` argument. +We make sure the different colors can be seen +(despite them all sitting on top of each other) +by setting the `opacity` argument in `mark_bar` to `0.5` +to make the bars slightly translucent. ```{code-cell} ipython3 morley_hist_colored = alt.Chart(morley_df).mark_bar(opacity=0.5).encode( - x = alt.X("Speed"), - y=alt.Y('count()'), - color = "Expt") + x=alt.X("Speed"), + y=alt.Y("count()"), + color=alt.Color("Expt") +) -final_plot_colored = morley_hist_colored + v_line +morley_hist_colored = morley_hist_colored + v_line ``` ```{code-cell} ipython3 :tags: ["remove-cell"] -glue('final_plot_colored', final_plot_colored, display=True) +glue('morley_hist_colored', morley_hist_colored, display=True) ``` -:::{glue:figure} final_plot_colored -:figwidth: 700px -:name: final_plot_colored +:::{glue:figure} morley_hist_colored +:figwidth: 700px +:name: morley_hist_colored Histogram of Michelson's speed of light data colored by experiment. ::: @@ -1281,48 +1428,52 @@ Histogram of Michelson's speed of light data colored by experiment. ```{index} integer ``` -Alright great, {numref}`final_plot_colored` looks...wait a second! We are not able to distinguish -between different Experiments in the histogram! What is going on here? Well, if you -recall from Chapter {ref}`wrangling`, the *data type* you use for each variable +Alright great, {numref}`morley_hist_colored` looks... wait a second! We are not able to distinguish +between different Experiments in the histogram! What is going on here? Well, if you +recall from the {ref}`wrangling` chapter, the *data type* you use for each variable can influence how Python and `altair` treats it. Here, we indeed have an issue with the data types in the `morley` data frame. In particular, the `Expt` column -is currently an *integer*. But we want to treat it as a -*category*, i.e., there should be one category per type of experiment. 
+is currently an *integer*---specifically, an `int64` type. But we want to treat it as a +*category*, i.e., there should be one category per type of experiment. +```{code-cell} ipython3 +morley_df.info() +``` ```{index} nominal, altair; :N ``` -To fix this issue we can convert the `Expt` variable into a `nominal`(categorical) type -variable by adding a suffix `:N`(where `N` stands for nominal type variable) with the `Expt` variable. -By doing this, we are ensuring that `altair` will treat this variable as a categorical variable, -and the color will be mapped discretely. Here, we also mention `stack=False`, so that the bars are not stacked on top of each other. +To fix this issue we can convert the `Expt` variable into a `nominal` +(i.e., categorical) type variable by adding a suffix `:N` +to the `Expt` variable. Adding the `:N` suffix ensures that `altair` +will treat a variable as a categorical variable, and +hence use a discrete color map in visualizations. +We also specify the `stack=False` argument in the `y` encoding so +that the bars are not stacked on top of each other. 
```{code-cell} ipython3 morley_hist_categorical = alt.Chart(morley_df).mark_bar(opacity=0.5).encode( - x = alt.X("Speed", bin=alt.Bin(maxbins=50)), - y=alt.Y('count()', stack=False), - color = "Expt:N") - -final_plot_categorical = morley_hist_categorical + v_line + x=alt.X("Speed", bin=alt.Bin(maxbins=50)), + y=alt.Y("count()", stack=False), + color=alt.Color("Expt:N") +) +morley_hist_categorical = morley_hist_categorical + v_line ``` ```{code-cell} ipython3 :tags: ["remove-cell"] -glue('final_plot_categorical', final_plot_categorical, display=True) +glue('morley_hist_categorical', morley_hist_categorical, display=True) ``` -:::{glue:figure} final_plot_categorical -:figwidth: 700px -:name: final_plot_categorical +:::{glue:figure} morley_hist_categorical +:figwidth: 700px +:name: morley_hist_categorical Histogram of Michelson's speed of light data colored by experiment as a categorical variable. ::: - - Unfortunately, the attempt to separate out the experiment number visually has -created a bit of a mess. All of the colors in {numref}`final_plot_categorical` are blending together, and although it is +created a bit of a mess. All of the colors in {numref}`morley_hist_categorical` are blending together, and although it is possible to derive *some* insight from this (e.g., experiments 1 and 3 had some of the most incorrect measurements), it isn't the clearest way to convey our message and answer the question. Let's try a different strategy of creating @@ -1333,43 +1484,52 @@ grid of separate histogram plots. ```{index} altair; facet ``` -We use the `facet` function to create a plot +We use the `facet` function to create a plot that has multiple subplots arranged in a grid. -The argument to `facet` specifies the variable(s) used to split the plot -into subplots, and how to split them (i.e., into rows or columns). -If the plot is to be split horizontally, into rows, -then the `rows` argument is used. 
-If the plot is to be split vertically, into columns, -then the `columns` argument is used. -Both the `rows` and `columns` arguments take the column names on which to split the data when creating the subplots. - -```{code-cell} ipython3 - -morley_hist = alt.Chart(morley_df).mark_bar(opacity = 0.5).encode( - x = alt.X("Speed", bin=alt.Bin(maxbins=50)), - y=alt.Y('count()', stack=False), - color = "Expt:N").properties(height=100, width=300) - -final_plot_facet = (morley_hist + v_line).facet(row = 'Expt:N', data = morley_df) - +The arguments to `facet` specify the variable(s) used to split the plot +into subplots (`Expt`), the data frame we are working with (`morley_df`), and +how to split them (i.e., into rows or columns). In this example, we choose to +have our plots in a single column (`columns=1`). This makes it easier for +us to compare along the `x`-axis as our vertical line is in the same +horizontal position. If instead you wanted to use a single row, you could +specify `rows=1`. + +There is another important change we have to make. When +we define `morley_hist`, we no longer supply `morley_df` as an +argument to `alt.Chart`. This is because `facet` takes care of separating +the data by `Expt` and providing it to each of the facet sub-plots.
+ +```{code-cell} ipython3 + +morley_hist = alt.Chart().mark_bar(opacity=0.5).encode( + x=alt.X("Speed", bin=alt.Bin(maxbins=50)), + y=alt.Y("count()", stack=False), + color=alt.Color("Expt:N") +).properties(height=100, width=400) + +morley_hist_facet = (morley_hist + v_line).facet( + "Expt", + data=morley_df, + columns=1 +) ``` ```{code-cell} ipython3 :tags: ["remove-cell"] -glue('final_plot_facet', final_plot_facet, display=True) +glue('morley_hist_facet', morley_hist_facet, display=True) ``` -:::{glue:figure} final_plot_facet -:figwidth: 700px -:name: final_plot_facet +:::{glue:figure} morley_hist_facet +:figwidth: 700px +:name: morley_hist_facet Histogram of Michelson's speed of light data split vertically by experiment. ::: -The visualization in {numref}`final_plot_facet` -now makes it quite clear how accurate the different experiments were -with respect to one another. -The most variable measurements came from Experiment 1. +The visualization in {numref}`morley_hist_facet` +now makes it quite clear how accurate the different experiments were +with respect to one another. +The most variable measurements came from Experiment 1. There the measurements ranged from about 650–1050 km/sec. The least variable measurements came from Experiment 2. There, the measurements ranged from about 750–950 km/sec. @@ -1378,76 +1538,104 @@ The most different experiments still obtained quite similar results! ```{index} altair; alt.X, altair; alt.Y, altair; configure_axis ``` -There are two finishing touches to make this visualization even clearer. First and foremost, we need to add informative axis labels -using the `alt.X` and `alt.Y` function, and increase the font size to make it readable using the `configure_axis` function. Second, and perhaps more subtly, even though it -is easy to compare the experiments on this plot to one another, it is hard to get a sense -of just how accurate all the experiments were overall. 
For example, how accurate is the value 800 on the plot, relative to the true speed of light? -To answer this question, we'll use the assign function to transform our data into a relative measure of accuracy rather than absolute measurements: +There are three finishing touches to make this visualization even clearer. +First and foremost, we need to add informative axis labels using the `alt.X` +and `alt.Y` functions, and increase the font size to make it readable using the +`configure_axis` function. We can also add a title; for a `facet` plot, this is +done by providing the `title` argument to the `facet` function. Finally, and perhaps most +subtly, even though it is easy to compare the experiments on this plot to one +another, it is hard to get a sense of just how accurate all the experiments +were overall. For example, how accurate is the value 800 on the plot, relative +to the true speed of light? To answer this question, we'll use the `assign` +function to transform our data into a relative measure of accuracy rather than +absolute measurements.
```{code-cell} ipython3 morley_rel = morley_df -morley_rel = morley_rel.assign(relative_accuracy = 100 * - ((299000 + morley_df['Speed']) - 299792.458) / (299792.458) ) +morley_rel = morley_rel.assign( + relative_accuracy=( + 100 *((299000 + morley_df["Speed"]) - 299792.458) / (299792.458) + ) +) morley_rel ``` ```{code-cell} ipython3 -v_line = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule( - strokeDash=[3,3], size=2).encode( - x='x') -morley_hist = alt.Chart().mark_bar(opacity=0.6).encode( - x = alt.X("relative_accuracy", bin=alt.Bin(maxbins=120), title = "Relative Accuracy (%)"), - y=alt.Y('count()', stack=False, title = "# Measurements"), - color = alt.Color("Expt:N", title = "Experiment ID")).properties(height=100, width= 400) +v_line = alt.Chart().mark_rule( + strokeDash=[3]).encode( + x=alt.datum(0) +) -final_plot_relative = (morley_hist + v_line).facet(row='Expt:N', data=morley_rel) +morley_hist = alt.Chart().mark_bar(opacity=0.6).encode( + x=alt.X( + "relative_accuracy", + bin=alt.Bin(maxbins=120), + title="Relative Accuracy (%)" + ), + y=alt.Y( + "count()", + stack=False, + title="# Measurements" + ), + color=alt.Color( + "Expt:N", + title="Experiment ID" + ) +).properties(height=100, width=400) + +morley_hist_relative = (morley_hist + v_line).facet( + "Expt", + data=morley_rel, + columns=1, + title="Histogram of relative accuracy of Michelson’s speed of light data" +) ``` ```{code-cell} ipython3 :tags: ["remove-cell"] -glue('final_plot_relative', final_plot_relative, display=True) +glue("morley_hist_relative", morley_hist_relative, display=True) ``` -:::{glue:figure} final_plot_relative -:figwidth: 700px -:name: final_plot_relative +:::{glue:figure} morley_hist_relative +:figwidth: 700px +:name: morley_hist_relative Histogram of relative accuracy split vertically by experiment with clearer axes and labels ::: -Wow, impressive! These measurements of the speed of light from 1879 had errors around *0.05%* of the true speed. 
{numref}`final_plot_relative` shows you that even though experiments 2 and 5 were perhaps the most accurate, all of the experiments did quite an -admirable job given the technology available at the time. +Wow, impressive! These measurements of the speed of light from 1879 had errors +around *0.05%* of the true speed. {numref}`morley_hist_relative` shows you that +even though experiments 2 and 5 were perhaps the most accurate, all of the +experiments did quite an admirable job given the technology available at the time. #### Choosing a binwidth for histograms -When you create a histogram in `altair`, the default number of bins used is 30. +When you create a histogram in `altair`, by default, it tries to choose a reasonable number of bins. Naturally, this is not always the right number to use. You can set the number of bins yourself by using the `maxbins` argument of `alt.Bin` in the `x` encoding. - -But what number of bins is the right one to use? +But what number of bins is the right one to use? Unfortunately there is no hard rule for what the right bin number -or width is. It depends entirely on your problem; the *right* number of bins -or bin width is -the one that *helps you answer the question* you asked. -Choosing the correct setting for your problem +or width is. It depends entirely on your problem; the *right* number of bins +or bin width is +the one that *helps you answer the question* you asked. +Choosing the correct setting for your problem is something that commonly takes iteration. - It's usually a good idea to try out several `maxbins` values to see which one most clearly captures your data in the context of the question you want to answer. -To get a sense for how different bin affect visualizations, +To get a sense for how different numbers of bins affect visualizations, let's experiment with the histogram that we have been working on in this section.
-In {numref}`final_plot_max_bins`, -we compare the default setting with three other histograms where we set the +In {numref}`morley_hist_max_bins`, +we compare the default setting with three other histograms where we set the `maxbins` to 200, 70 and 5. -In this case, we can see that both the default number of bins -and the `maxbins=70` of are effective for helping answer our question. +In this case, we can see that both the default number of bins +and the `maxbins=70` setting are effective for helping to answer our question. On the other hand, the `maxbins=200` and `maxbins=5` settings produce bins that are too small and too big, respectively. @@ -1456,117 +1644,94 @@ On the other hand, the `maxbins=200` and `maxbins=5` are too small and too big, ```{code-cell} ipython3 :tags: ["remove-cell"] -morley_hist_default = alt.Chart(morley_rel).mark_bar(opacity=0.6).encode( - x = alt.X("relative_accuracy", title = "Relative Accuracy (%)"), - y=alt.Y('count()', stack=False, title = "# Measurements"), - color = alt.Color("Expt:N", title = "Experiment ID")).properties(height=100, width=400) - -morley_hist_200 = alt.Chart(morley_rel).mark_bar(opacity=0.6).encode( - x = alt.X("relative_accuracy", bin=alt.Bin(maxbins=200), title = "Relative Accuracy (%)"), - y=alt.Y('count()', stack=False, title = "# Measurements"), - color = alt.Color("Expt:N", title = "Experiment ID")).properties(height=100, width=400) -morley_hist_70 = alt.Chart(morley_rel).mark_bar(opacity=0.6).encode( - x = alt.X("relative_accuracy", bin=alt.Bin(maxbins=70), title = "Relative Accuracy (%)"), - y=alt.Y('count()', stack=False, title = "# Measurements"), - color = alt.Color("Expt:N", title = "Experiment ID")).properties(height=100, width=400) - -morley_hist_5 = alt.Chart(morley_rel).mark_bar(opacity=0.6).encode( - x = alt.X("relative_accuracy", bin=alt.Bin(maxbins=5), title = "Relative Accuracy (%)"), - y=alt.Y('count()', stack=False, title = "# Measurements"), - color = alt.Color("Expt:N", title = "Experiment ID")).properties(height=100,
width=300) - - - - - -final_plot_max_bins = ((morley_hist_default + v_line).facet(row='Expt:N', data=morley_rel, title = "default maxbins") | (morley_hist_200 + v_line).facet(row='Expt:N', data=morley_rel, title = "maxBins=200")) & ((morley_hist_70 + v_line).facet(row='Expt:N', data=morley_rel, title = "maxBins=70") | (morley_hist_5 + v_line).facet(row='Expt:N', data=morley_rel, title = "maxBins=5")) - - - +morley_hist_default = alt.Chart().mark_bar(opacity=0.9).encode( + x=alt.X( + "relative_accuracy", + title="Relative Accuracy (%)" + ), + y=alt.Y( + "count()", + stack=False, + title="# Measurements" + ), + color=alt.Color( + "Expt:N", + title="Experiment ID" + ) +).properties(height=100, width=200) + +morley_hist_200 = alt.Chart().mark_bar(opacity=0.9).encode( + x=alt.X( + "relative_accuracy", + bin=alt.Bin(maxbins=200), + title="Relative Accuracy (%)" + ), + y=alt.Y( + "count()", + stack=False, + title="# Measurements" + ), + color=alt.Color( + "Expt:N", title="Experiment ID" + ) +).properties(height=100, width=200) + +morley_hist_70 = alt.Chart().mark_bar(opacity=0.9).encode( + x=alt.X( + "relative_accuracy", + bin=alt.Bin(maxbins=70), + title="Relative Accuracy (%)" + ), + y=alt.Y( + "count()", + stack=False, + title="# Measurements" + ), + color=alt.Color( + "Expt:N", + title="Experiment ID" + ) +).properties(height=100, width=200) + +morley_hist_5 = alt.Chart().mark_bar(opacity=0.9).encode( + x=alt.X( + "relative_accuracy", + bin=alt.Bin(maxbins=5), + title="Relative Accuracy (%)" + ), + y=alt.Y( + "count()", + stack=False, + title="# Measurements" + ), + color=alt.Color( + "Expt:N", + title="Experiment ID" + ) +).properties(height=100, width=200) + +morley_hist_max_bins = (( + (morley_hist_default + v_line).facet(row="Expt:N", data=morley_rel, title="default maxbins") | + (morley_hist_200 + v_line).facet(row="Expt:N", data=morley_rel, title="maxBins=200")) & + ((morley_hist_70 + v_line).facet(row="Expt:N", data=morley_rel, title="maxBins=70") | + 
(morley_hist_5 + v_line).facet(row="Expt:N", data=morley_rel, title="maxBins=5") +)) ``` ```{code-cell} ipython3 :tags: ["remove-cell"] -glue('final_plot_max_bins', final_plot_max_bins, display=True) +glue("morley_hist_max_bins", morley_hist_max_bins, display=True) ``` -:::{glue:figure} final_plot_max_bins -:figwidth: 700px -:name: final_plot_max_bins +:::{glue:figure} morley_hist_max_bins +:figwidth: 700px +:name: morley_hist_max_bins Effect of varying number of max bins on histograms. ::: -#### Adding layers to a `altair` plot object {-} - -```{index} altair; + -``` - -One of the powerful features of `altair` is that you -can continue to iterate on a single plot object, adding and refining -one layer at a time. If you stored your plot as a named object -using the assignment symbol (`=`), you can -add to it using the `+` operator. -For example, if we wanted to add a vertical line to the last plot we created (`morley_hist`), -we can use the `+` operator to add a vertical line chart layer with the `mark_rule` function. -The result is shown in {numref}`morley_hist_layer`. - -```{code-cell} ipython3 -morley_hist_colored = alt.Chart(morley_df).mark_bar(opacity=0.5).encode( - x = alt.X("Speed"), - y=alt.Y('count()'), - color = "Expt:N") - -v_line = alt.Chart(pd.DataFrame({'x': [792.458]})).mark_rule( - strokeDash=[3,3], size=1).encode( - x='x') -morley_hist_layer = morley_hist_colored + v_line - -``` - -```{code-cell} ipython3 -:tags: ["remove-cell"] -glue('morley_hist_layer', morley_hist_layer, display=True) -``` - -:::{glue:figure} morley_hist_layer -:figwidth: 700px -:name: morley_hist_layer - -Histogram of Michelson's speed of light data colored by experiment with layered vertical line. 
-::: - - -We can also add a title to the chart by specifying `title` argument in the `alt.Chart` function - - -```{code-cell} ipython3 -morley_hist_title = alt.Chart(morley_df, title = "Histogram of Michelson's speed of light data colored by experiment").mark_bar(opacity=0.5).encode( - x = alt.X("Speed"), - y=alt.Y('count()'), - color = "Expt:N") - - -``` -```{code-cell} ipython3 -:tags: ["remove-cell"] -glue('morley_hist_title', morley_hist_title, display=True) -``` - -:::{glue:figure} morley_hist_title -:figwidth: 700px -:name: morley_hist_title - -Histogram of Michelson's speed of light data colored with title -::: - - -> **Note:** Good visualization titles clearly communicate -> the take home message to the audience. Typically, -> that is the answer to the question you posed before making the visualization. - ## Explaining the visualization -#### *Tell a story* {-} +#### *Tell a story* Typically, your visualization will not be shown entirely on its own, but rather it will be part of a larger presentation. Further, visualizations can provide @@ -1575,24 +1740,24 @@ conclusion. For example, you could use an exploratory visualization in the opening of the presentation to motivate your choice of a more detailed data analysis / model, a visualization of the results of your analysis to show what your analysis has uncovered, or even one at the end of a presentation to help -suggest directions for future work. +suggest directions for future work. ```{index} visualization; explanation ``` Regardless of where it appears, a good way to discuss your visualization is as -a story: +a story: -1) Establish the setting and scope, and describe why you did what you did. +1) Establish the setting and scope, and describe why you did what you did. 2) Pose the question that your visualization answers. Justify why the question is important to answer. -3) Answer the question using your visualization. 
Make sure you describe *all* aspects of the visualization (including describing the axes). But you +3) Answer the question using your visualization. Make sure you describe *all* aspects of the visualization (including describing the axes). But you can emphasize different aspects based on what is important to answer your question: - **trends (lines):** Does a line describe the trend well? If so, the trend is *linear*, and if not, the trend is *nonlinear*. Is the trend increasing, decreasing, or neither? Is there a periodic oscillation (wiggle) in the trend? Is the trend noisy (does the line "jump around" a lot) or smooth? - **distributions (scatters, histograms):** How spread out are the data? Where are they centered, roughly? Are there any obvious "clusters" or "subgroups", which would be visible as multiple bumps in the histogram? - **distributions of two variables (scatters):** Is there a clear / strong relationship between the variables (points fall in a distinct pattern), a weak one (points fall in a pattern but there is some noise), or no discernible relationship (the data are too noisy to make any conclusion)? - - **amounts (bars):** How large are the bars relative to one another? Are there patterns in different groups of bars? + - **amounts (bars):** How large are the bars relative to one another? Are there patterns in different groups of bars? 4) Summarize your findings, and use them to motivate whatever you will discuss next. Below are two examples of how one might take these four steps in describing the example visualizations that appeared earlier in this chapter. @@ -1601,7 +1766,7 @@ Each of the steps is denoted by its numeral in parentheses, e.g. (3). 
```{index} Mauna Loa ``` -**Mauna Loa Atmospheric CO$_{\text{2}}$ Measurements:** (1) Many +**Mauna Loa Atmospheric CO$_{\text{2}}$ Measurements:** (1) Many current forms of energy generation and conversion—from automotive engines to natural gas power plants—rely on burning fossil fuels and produce greenhouse gases, typically primarily carbon dioxide (CO$_{\text{2}}$), as a @@ -1639,7 +1804,7 @@ result. (4) It would be worth further investigating the differences between these experiments to see why they produced different results. ## Saving the visualization -#### *Choose the right output format for your needs* {-} +#### *Choose the right output format for your needs* ```{index} see: bitmap; raster graphics ``` @@ -1653,7 +1818,7 @@ such as file size/type limitations (e.g., if you are submitting your visualization as part of a conference paper or to a poster printing shop) and where it will be displayed (e.g., online, in a paper, on a poster, on a billboard, in talk slides). Generally speaking, images come in two flavors: -*raster* formats +*raster* formats and *vector* formats. ```{index} raster graphics; file types @@ -1667,21 +1832,22 @@ is not noticeable. *Lossless* formats, on the other hand, allow a perfect display of the original image. - *Common file types:* - + - [JPEG](https://en.wikipedia.org/wiki/JPEG) (`.jpg`, `.jpeg`): lossy, usually used for photographs - [PNG](https://en.wikipedia.org/wiki/Portable_Network_Graphics) (`.png`): lossless, usually used for plots / line drawings - + - [BMP](https://en.wikipedia.org/wiki/BMP_file_format) (`.bmp`): lossless, raw image data, no compression (rarely used) + - [TIFF](https://en.wikipedia.org/wiki/TIFF) (`.tif`, `.tiff`): typically lossless, no compression, used mostly in graphic arts, publishing - *Open-source software:* [GIMP](https://www.gimp.org/) ```{index} vector graphics; file types ``` -**Vector** images are represented as a collection of mathematical -objects (lines, surfaces, shapes, curves). 
When the computer displays the image, it +**Vector** images are represented as a collection of mathematical +objects (lines, surfaces, shapes, curves). When the computer displays the image, it redraws all of the elements using their mathematical formulas. - *Common file types:* - [SVG](https://en.wikipedia.org/wiki/Scalable_Vector_Graphics) (`.svg`): general-purpose use - + - [EPS](https://en.wikipedia.org/wiki/Encapsulated_PostScript) (`.eps`), general-purpose use (rarely used) - *Open-source software:* [Inkscape](https://inkscape.org/) Raster and vector images have opposing advantages and disadvantages. A raster @@ -1693,7 +1859,7 @@ computer has to draw all the elements each time it is displayed. For example, if you have a scatter plot with 1 million points stored as an SVG file, it may take your computer some time to open the image. On the other hand, you can zoom into / scale up vector graphics as much as you like without the image looking -bad, while raster images eventually start to look "pixelated." +bad, while raster images eventually start to look "pixelated." ```{index} PDF ``` @@ -1703,52 +1869,37 @@ bad, while raster images eventually start to look "pixelated." > **Note:** The portable document format [PDF](https://en.wikipedia.org/wiki/PDF) (`.pdf`) is commonly used to > store *both* raster and vector formats. If you try to open a PDF and it's taking a long time -> to load, it may be because there is a complicated vector graphics image that your computer is rendering. - -Let's learn how to save plot images to these different file formats using a -scatter plot of -the [Old Faithful data set](https://www.stat.cmu.edu/~larry/all-of-statistics/=data/faithful.dat) -{cite:p}`faithfuldata`, -shown in {numref}`faithful_scatter_labels` +> to load, it may be because there is a complicated vector graphics image that your computer is rendering. 
-Now that we have a named `altair` plot object, we can use the `chart.save` function -to save a file containing this image. -`chart.save` works by taking the path to the directory where you would like to save the file -(e.g., `img/filename.png` to save a file named `filename` to the `img` directory), -The kind of image to save is specified by the file extension. -For example, -to create a PNG image file, we specify that the file extension is `.png`. -Below we demonstrate how to save PNG and SVG file types -for the `faithful_scater_labels` plot: +Let's learn how to save plot images to `.png` and `.svg` file formats using the +`faithful_scatter_labels` scatter plot of the [Old Faithful data set](https://www.stat.cmu.edu/~larry/all-of-statistics/=data/faithful.dat) +{cite:p}`faithfuldata` that we created earlier, shown in {numref}`faithful_scatter_labels`. +To save the plot to a file, we can use the `save` +method. The `save` method takes the path to the filename where you would like to +save the file (e.g., `img/filename.png` to save a file named `filename.png` to the `img` directory). +The kind of image to save is specified by the file extension. For example, to +create a PNG image file, we specify that the file extension is `.png`. Below +we demonstrate how to save PNG and SVG file types for the +`faithful_scatter_labels` plot. 
```{code-cell} ipython3 -:tags: ["remove-cell"] -!pip install altair_saver -``` - - -```{code-cell} ipython3 -#!pip install altair_saver #uncomment and run in jupyter notebook to install altair_saver, if not already installed from altair_saver import save -faithful_scatter_labels.save("faithful_plot.png") -faithful_scatter_labels.save("faithful_plot.svg") +faithful_scatter_labels.save("img/faithful_plot.png") +faithful_scatter_labels.save("img/faithful_plot.svg") ``` ```{code-cell} ipython3 -:tags: ["remove-cell"] import os -png_size = os.path.getsize("data/faithful_plot.png")/1000000 -svg_size = os.path.getsize("data/faithful_plot.svg")/1000000 +import numpy as np +png_size = np.round(os.path.getsize("img/faithful_plot.png")/(1024*1024), 2) +svg_size = np.round(os.path.getsize("img/faithful_plot.svg")/(1024*1024), 2) -# glue("png_size", png_size) -# glue("svg_size", svg_size) +glue("png_size", png_size) +glue("svg_size", svg_size) ``` - - - ```{list-table} File sizes of the scatter plot of the Old Faithful data set when saved as different file formats. :header-rows: 1 :name: png-vs-svg-table @@ -1758,30 +1909,24 @@ svg_size = os.path.getsize("data/faithful_plot.svg")/1000000 - Image size * - Raster - PNG - - {glue:}`png_size` + - {glue:}`png_size` MB * - Vector - SVG - - {glue:}`svg_size` + - {glue:}`svg_size` MB ``` - - Take a look at the file sizes in {numref}`png-vs-svg-table` -Wow, that's quite a difference! Notice that for such a simple plot with few -graphical elements (points), the vector graphics format (SVG) is over 100 times -smaller than the uncompressed raster images. - -In {numref}`png-vs-svg`, we also show what +Wow, that's quite a difference! In this case, the `.png` image is almost 4 times +smaller than the `.svg` image. Since there are a decent number of points in the plot, +the vector graphics format image (`.svg`) is bigger than the raster image (`.png`), which +just stores the image data itself. 
+In {numref}`png-vs-svg`, we show what the images look like when we zoom in to a rectangle with only 3 data points. You can see why vector graphics formats are so useful: because they're just based on mathematical formulas, vector graphics can be scaled up to arbitrary sizes. This makes them great for presentation media of all sizes, from papers to posters to billboards. - - - - ```{figure} img/png-vs-svg.png --- height: 400px @@ -1792,15 +1937,15 @@ Zoomed in `faithful`, raster (PNG, left) and vector (SVG, right) formats. ## Exercises -Practice exercises for the material covered in this chapter -can be found in the accompanying -[worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets#readme) +Practice exercises for the material covered in this chapter +can be found in the accompanying +[worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme) in the "Effective data visualization" row. You can launch an interactive version of the worksheet in your browser by clicking the "launch binder" button. You can also preview a non-interactive version of the worksheet by clicking "view worksheet." If you instead decide to download the worksheet and run it on your own machine, make sure to follow the instructions for computer setup -found in Chapter \@ref(move-to-your-own-machine). This will ensure that the automated feedback +found in the {ref}`move-to-your-own-machine` chapter. This will ensure that the automated feedback and guidance that the worksheets provide will function as intended. ## Additional resources @@ -1814,7 +1959,7 @@ and guidance that the worksheets provide will function as intended. a wealth of information on designing effective visualizations. It is not specific to any particular programming language or library. If you want to improve your visualization skills, this is the next place to look. 
-- The [dates and times](https://wesmckinney.com/book/time-series.html){cite:p}`mckinney2012python` +- The [dates and times](https://wesmckinney.com/book/time-series.html){cite:p}`mckinney2012python` chapter is where you should look if you want to learn about `date` and `time`, including how to create them, and how to use them to effectively handle durations, etc ## References diff --git a/source/wrangling.md b/source/wrangling.md index 4f0a4573..d9e33cc1 100644 --- a/source/wrangling.md +++ b/source/wrangling.md @@ -41,71 +41,33 @@ By the end of the chapter, readers will be able to do the following: - Define the term "tidy data". - Discuss the advantages of storing data in a tidy data format. - - Define what lists, series and data frames are in Python, and describe how they relate to + - Define what series and data frames are in Python, and describe how they relate to each other. - Describe the common types of data in Python and their uses. - Recall and use the following functions for their intended data wrangling tasks: - - `.agg` - - `.apply` - - `.assign` - - `.groupby` - - `.melt` - - `.pivot` - - `.str.split` + - `agg` + - `apply` + - `assign` + - `groupby` + - `melt` + - `pivot` + - `str.split` - Recall and use the following operators for their intended data wrangling tasks: - - `==` + - `==` - `in` - `and` - `or` - - `df[]` - - `.iloc[]` - - `.loc[]` + - `[]` + - `loc[]` + - `iloc[]` -```{code-cell} ipython3 ---- -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# By the end of the chapter, readers will be able to do the following: - -# - Define the term "tidy data". -# - Discuss the advantages of storing data in a tidy data format. -# - Define what vectors, lists, and data frames are in R, and describe how they relate to -# each other. -# - Describe the common types of data in R and their uses. 
-# - Recall and use the following functions for their -# intended data wrangling tasks: -# - `across` -# - `c` -# - `filter` -# - `group_by` -# - `select` -# - `map` -# - `mutate` -# - `pull` -# - `pivot_longer` -# - `pivot_wider` -# - `rowwise` -# - `separate` -# - `summarize` -# - Recall and use the following operators for their -# intended data wrangling tasks: -# - `==` -# - `%in%` -# - `!` -# - `&` -# - `|` -# - `|>` and `%>%` -``` - -## Data frames, series, and lists - -In Chapters {ref}`intro` and {ref}`reading`, *data frames* were the focus: +## Data frames and series + +In the chapters on {ref}`intro` and {ref}`reading`, *data frames* were the focus: we learned how to import data into Python as a data frame, and perform basic operations on data frames in Python. -In the remainder of this book, this pattern continues. The vast majority of tools we use will require +In the remainder of this book, this pattern continues. The vast majority of tools we use will require that data are represented as a `pandas` **data frame** in Python. Therefore, in this section, we will dig more deeply into what data frames are and how they are represented in Python. This knowledge will be helpful in effectively utilizing these objects in our data analyses. @@ -147,46 +109,31 @@ data set. There are 13 entities in the data set in total, corresponding to the ```{figure} img/data_frame_slides_cdn/data_frame_slides_cdn.004.jpeg :name: fig:02-obs -:figclass: caption-hack +:figclass: figure A data frame storing data regarding the population of various regions in Canada. In this example data frame, the row that corresponds to the observation for the city of Vancouver is colored yellow, and the column that corresponds to the population variable is colored blue. ``` -```{code-cell} ipython3 -:tags: [remove-cell] - -# The following cell was removed because there is no "vector" in Python. 
-``` - -+++ {"tags": ["remove-cell"]} - -Python stores the columns of a data frame as either -*lists* or *vectors*. For example, the data frame in Figure -{numref}`fig:02-vectors` has three vectors whose names are `region`, `year` and -`population`. The next two sections will explain what lists and vectors are. - -```{figure} img/data_frame_slides_cdn/data_frame_slides_cdn.005.jpeg -:name: fig:02-vectors -:figclass: caption-hack - -Data frame with three vectors. -``` - -+++ - ### What is a series? ```{index} pandas.Series ``` -In Python, `pandas` **series** are arrays with labels. They are strictly 1-dimensional and can contain any data type (integers, strings, floats, etc), including a mix of them (objects); -Python has several different basic data types, as shown in {numref}`tab:datatype-table`. -You can create a `pandas` series using the `pd.Series()` function. For -example, to create the vector `region` as shown in -{numref}`fig:02-series`, you can write: +In Python, `pandas` **series** are objects that can contain one or more elements (like a list). +They are a single column, are ordered, can be indexed, and can contain any data type. +The `pandas` package uses `Series` objects to represent the columns in a data frame. +`Series` can contain a mix of data types, but it is good practice to only include a single type in a series +because all observations of one variable should be the same type. +Python +has several different basic data types, as shown in +{numref}`tab:datatype-table`. +You can create a `pandas` series using the +`pd.Series()` function. For example, to create the series `region` as shown +in {numref}`fig:02-series`, you can write the following. 
```{code-cell} ipython3 import pandas as pd + region = pd.Series(["Toronto", "Montreal", "Vancouver", "Calgary", "Ottawa"]) region ``` @@ -195,46 +142,11 @@ region ```{figure} img/wrangling/pandas_dataframe_series.png :name: fig:02-series -:figclass: caption-hack +:figclass: figure Example of a `pandas` series whose type is string. ``` -+++ {"tags": ["remove-cell"]} - -### What is a vector? - -In R, **vectors** \index{vector}\index{atomic vector|see{vector}} are objects that can contain one or more elements. The vector -elements are ordered, and they must all be of the same **data type**; -R has several different basic data types, as shown in {numref}`tab:datatype-table`. -Figure \@ref(fig:02-vector) provides an example of a vector where all of the elements are -of character type. -You can create vectors in R using the `c` function \index{c function} (`c` stands for "concatenate"). For -example, to create the vector `region` as shown in Figure -\@ref(fig:02-vector), you would write: - -``` {r} -year <- c("Toronto", "Montreal", "Vancouver", "Calgary", "Ottawa") -year -``` - -> **Note:** Technically, these objects are called "atomic vectors." In this book -> we have chosen to call them "vectors," which is how they are most commonly -> referred to in the R community. To be totally precise, "vector" is an umbrella term that -> encompasses both atomic vector and list objects in R. But this creates a -> confusing situation where the term "vector" could -> mean "atomic vector" *or* "the umbrella term for atomic vector and list," -> depending on context. Very confusing indeed! So to keep things simple, in -> this book we *always* use the term "vector" to refer to "atomic vector." -> We encourage readers who are enthusiastic to learn more to read the -> Vectors chapter of *Advanced R* [@wickham2019advanced]. 
- -``` {r 02-vector, echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Example of a vector whose type is character.", fig.retina = 2, out.width = "100%"} -image_read("img/data_frame_slides_cdn/data_frame_slides_cdn.007.jpeg") %>% - image_crop("3632x590") -``` - -+++ ```{code-cell} ipython3 :tags: [remove-cell] @@ -265,76 +177,30 @@ image_read("img/data_frame_slides_cdn/data_frame_slides_cdn.007.jpeg") %>% ```{table} Basic data types in Python :name: tab:datatype-table -| English name | Type name | Type Category | Description | Example | -| :-------------------- | :--------- | :------------- | :-------------------------------------------- | :----------------------------------------- | -| integer | `int` | Numeric Type | positive/negative whole numbers | `42` | -| floating point number | `float` | Numeric Type | real number in decimal form | `3.14159` | -| boolean | `bool` | Boolean Values | true or false | `True` | -| string | `str` | Sequence Type | text | `"Can I have a cheezburger?"` | -| list | `list` | Sequence Type | a collection of objects - mutable & ordered | `['Ali', 'Xinyi', 'Miriam']` | -| tuple | `tuple` | Sequence Type | a collection of objects - immutable & ordered | `('Thursday', 6, 9, 2018)` | -| dictionary | `dict` | Mapping Type | mapping of key-value pairs | `{'name':'DSCI', 'code':100, 'credits':2}` | -| none | `NoneType` | Null Object | represents no value | `None` | +| Data type | Abbreviation | Description | Example | +| :-------------------- | :----------- | :-------------------------------------------- | :----------------------------------------- | +| integer | `int` | positive/negative/zero whole numbers | `42` | +| floating point number | `float` | real number in decimal form | `3.14159` | +| boolean | `bool` | true or false | `True` | +| string | `str` | text | `"Hello World"` | +| none | `NoneType` | represents no value | `None` | ``` +++ -It is important in Python to make sure you represent your data with the correct type. 
-Many of the `pandas` functions we use in this book treat -the various data types differently. You should use integers and float types -(which both fall under the "numeric" umbrella type) to represent numbers and perform -arithmetic. Strings are used to represent data that should -be thought of as "text", such as words, names, paths, URLs, and more. -There are other basic data types in Python, such as *set* -and *complex*, but we do not use these in this textbook. - -```{code-cell} ipython3 -:tags: [remove-cell] - -# It is important in R to make sure you represent your data with the correct type. -# Many of the `tidyverse` functions we use in this book treat -# the various data types differently. You should use integers and double types -# (which both fall under the "numeric" umbrella type) to represent numbers and perform -# arithmetic. Doubles are more common than integers in R, though; for instance, a double data type is the -# default when you create a vector of numbers using `c()`, and when you read in -# whole numbers via `read_csv`. Characters are used to represent data that should -# be thought of as "text", such as words, names, paths, URLs, and more. Factors help us -# encode variables that represent *categories*; a factor variable takes one of a discrete -# set of values known as *levels* (one for each category). The levels can be ordered or unordered. Even though -# factors can sometimes *look* like characters, they are not used to represent -# text, words, names, and paths in the way that characters are; in fact, R -# internally stores factors using integers! There are other basic data types in R, such as *raw* -# and *complex*, but we do not use these in this textbook. -``` - -### What is a list? +It is important in Python to make sure you represent your data with the correct type. +Many of the `pandas` functions we use in this book treat +the various data types differently. 
You should use `int` and `float` types +to represent numbers and perform arithmetic. The `int` type is for integers that have no decimal point, +while the `float` type is for numbers that have a decimal point. +The `bool` type is for boolean variables that can only take on one of two values: `True` or `False`. +The `str` type is used to represent data that should +be thought of as "text", such as words, names, paths, URLs, and more. +A `NoneType` is a special type in Python that is used to indicate no value; this can occur, +for example, when you have missing data. +There are other basic data types in Python, but we will generally +not use these in this textbook. -```{index} list -``` - -Lists are built-in objects in Python that have multiple, ordered elements. -`pandas` series can be treated as lists with labels (indices). - -```{code-cell} ipython3 -:tags: [remove-cell] - -# Lists \index{list} are also objects in R that have multiple, ordered elements. -# Vectors and lists differ by the requirement of element type -# consistency. All elements within a single vector must be of the same type (e.g., -# all elements are characters), whereas elements within a single list can be of -# different types (e.g., characters, integers, logicals, and even other lists). -``` - -+++ {"tags": ["remove-cell"]} - -```{figure} img/data_frame_slides_cdn/data_frame_slides_cdn.008.jpeg -:name: fig:02-vec-vs-list -:figclass: caption-hack - -A vector versus a list. -``` - -+++ ### What does this have to do with data frames? @@ -343,41 +209,26 @@ A vector versus a list. ```{index} data frame; definition ``` -A data frame is really just series stuck together that follows two rules: - -1. Each element itself is a series. -2. Each element (series) must have the same length. - -Not all columns in a data frame need to be of the same type. +A data frame is really just a collection of series that are stuck together, +where each series corresponds to one column and all must have the same length. 
+But not all columns in a data frame need to be of the same type. {numref}`fig:02-dataframe` shows a data frame where -the columns are series of different types. +the columns are series of different types. But each element *within* +one column should usually be the same type, since the values for a single variable +are usually all of the same type. For example, if the variable is the name of a city, +that name should be a string, whereas if the variable is a year, that should be an +integer. So even though series let you put different types in them, it is most common +(and good practice!) to have just one type per column. +++ {"tags": []} ```{figure} img/wrangling/pandas_dataframe_series-3.png :name: fig:02-dataframe -:figclass: caption-hack +:figclass: figure -Data frame and vector types. +Data frame and series types. ``` -```{code-cell} ipython3 -:tags: [remove-cell] - -# A data frame \index{data frame!definition} is really a special kind of list that follows two rules: - -# 1. Each element itself must either be a vector or a list. -# 2. Each element (vector or list) must have the same length. - -# Not all columns in a data frame need to be of the same type. -# Figure \@ref(fig:02-dataframe) shows a data frame where -# the columns are vectors of different types. -# But remember: because the columns in this example are *vectors*, -# the elements must be the same data type *within each column.* -# On the other hand, if our data frame had *list* columns, there would be no such requirement. -# It is generally much more common to use *vector* columns, though, -# as the values for a single variable are usually all of the same type. -``` ```{index} type ``` @@ -386,46 +237,72 @@ Data frame and vector types. > For example we can check the class of the Canadian languages data set, > `can_lang`, we worked with in the previous chapters and we see it is a `pandas.core.frame.DataFrame`. 
-```{code-cell} ipython3 -:tags: [remove-cell] - -# The functions from the `tidyverse` package that we use often give us a -# special class of data frame called a *tibble*. Tibbles have some additional \index{tibble} -# features and benefits over the built-in data frame object. These include the -# ability to add useful attributes (such as grouping, which we will discuss later) -# and more predictable type preservation when subsetting. -# Because a tibble is just a data frame with some added features, -# we will collectively refer to both built-in R data frames and -# tibbles as data frames in this book. - -# > **Note:** You can use the function `class` \index{class} on a data object to assess whether a data -# > frame is a built-in R data frame or a tibble. If the data object is a data -# > frame, `class` will return `"data.frame"`. If the data object is a -# > tibble it will return `"tbl_df" "tbl" "data.frame"`. You can easily convert -# > built-in R data frames to tibbles using the `tidyverse` `as_tibble` function. -# > For example we can check the class of the Canadian languages data set, -# > `can_lang`, we worked with in the previous chapters and we see it is a tibble. -``` ```{code-cell} ipython3 can_lang = pd.read_csv("data/can_lang.csv") type(can_lang) ``` -Lists, Series and DataFrames are basic types of *data structure* in Python, which -are core to most data analyses. We summarize them in -{numref}`tab:datastructure-table`. There are several other data structures in the Python programming -language (*e.g.,* matrices), but these are beyond the scope of this book. +### Data structures in Python -+++ +The `Series` and `DataFrame` types are *data structures* in Python, which +are core to most data analyses. +The functions from `pandas` that we use often give us back a `DataFrame` +or a `Series` depending on the operation. Because +`Series` are essentially simple `DataFrames`, we will refer +to both `DataFrames` and `Series` as "data frames" in the text. 
+There are other types that represent data structures in Python. +We summarize the most common ones in {numref}`tab:datastruc-table`. ```{table} Basic data structures in Python -:name: tab:datastructure-table +:name: tab:datastruc-table | Data Structure | Description | -| --- |------------ | -| list | An 1D ordered collection of values that can store multiple data types at once. | -| Series | An 1D ordered collection of values *with labels* that can store multiple data types at once. | -| DataFrame | A 2D labeled data structure with columns of potentially different types. | +| --- | ----------- | +| list | An ordered collection of values that can store multiple data types at once. | +| dict | A labeled data structure where `keys` are paired with `values` | +| Series | An ordered collection of values *with labels* that can store multiple data types at once. | +| DataFrame | A labeled data structure with `Series` columns of potentially different types. | +``` + +A `list` is an ordered collection of values. To create a list, we put the contents of the list in between +square brackets `[]`, where each item of the list is separated by a comma. A `list` can contain values +of different types. The example below contains six `str` entries. + +```{code-cell} ipython3 +cities = ["Toronto", "Vancouver", "Montreal", "Calgary", "Ottawa", "Winnipeg"] +cities +``` +A list can directly be converted to a pandas `Series`. +```{code-cell} ipython3 +cities_series = pd.Series(cities) +cities_series +``` + +A `dict`, or dictionary, contains pairs of "keys" and "values." +You use a key to look up its corresponding value. Dictionaries are created +using curly brackets `{}`. Each entry starts with the +key on the left, followed by a colon symbol `:`, and then the value. +A dictionary can have multiple key-value pairs, each separated by a comma. 
+Keys can take a wide variety of types (`int` and `str` are commonly used), and values can take any type; +the key-value pairs in a dictionary can all be of different types, too. + In the example below, +we create a dictionary that has two keys: `"cities"` and `"population"`. +The values associated with each are lists. +```{code-cell} ipython3 +population_in_2016 = { + "cities": ["Toronto", "Vancouver", "Montreal", "Calgary", "Ottawa", "Winnipeg"], + "population": [2235145, 1027613, 1823281, 544870, 571146, 321484] +} +population_in_2016 +``` +A dictionary can be converted to a data frame. Keys +become the column names, and the values become the entries in +those columns. Dictionaries on their own are quite simple objects; it is preferable to work with a data frame +because then we have access to the built-in functionality in +`pandas` (e.g. `loc[]`, `[]`, and many functions that we will discuss in the upcoming sections)! +```{code-cell} ipython3 +population_in_2016 = pd.DataFrame(population_in_2016) +population_in_2016 ``` +++ @@ -435,9 +312,10 @@ language (*e.g.,* matrices), but these are beyond the scope of this book. ```{index} tidy data; definition ``` -There are many ways a tabular data set can be organized. This chapter will focus -on introducing the **tidy data** format of organization and how to make your raw -(and likely messy) data tidy. A tidy data frame satisfies +There are many ways a tabular data set can be organized. The data frames we +have looked at so far have all been using the **tidy data** format of +organization. This chapter will focus on introducing the tidy data format and +how to make your raw (and likely messy) data tidy. A tidy data frame satisfies the following three criteria {cite:p}`wickham2014tidy`: - each row is a single observation, @@ -445,14 +323,14 @@ the following three criteria {cite:p}`wickham2014tidy`: - each value is a single cell (i.e., its entry in the data frame is not shared with another value). 
-{numref}`fig:02-tidy-image` demonstrates a tidy data set that satisfies these +{numref}`fig:02-tidy-image` demonstrates a tidy data set that satisfies these three criteria. +++ {"tags": []} ```{figure} img/tidy_data/tidy_data.001-cropped.jpeg :name: fig:02-tidy-image -:figclass: caption-hack +:figclass: figure Tidy data satisfies three criteria. ``` @@ -464,8 +342,8 @@ Tidy data satisfies three criteria. There are many good reasons for making sure your data are tidy as a first step in your analysis. The most important is that it is a single, consistent format that nearly every function -in the `pandas` recognizes. No matter what the variables and observations -in your data represent, as long as the data frame +in the `pandas` package recognizes. No matter what the variables and observations +in your data represent, as long as the data frame is tidy, you can manipulate it, plot it, and analyze it using the same tools. If your data is *not* tidy, you will have to write special bespoke code in your analysis that will not only be error-prone, but hard for others to understand. @@ -486,23 +364,23 @@ below! +++ -### Tidying up: going from wide to long using `.melt` +### Tidying up: going from wide to long using `melt` ```{index} pandas.DataFrame; melt ``` -One task that is commonly performed to get data into a tidy format -is to combine values that are stored in separate columns, +One task that is commonly performed to get data into a tidy format +is to combine values that are stored in separate columns, but are really part of the same variable, into one. -Data is often stored this way -because this format is sometimes more intuitive for human readability +Data is often stored this way +because this format is sometimes more intuitive for human readability and understanding, and humans create data sets. -In {numref}`fig:02-wide-to-long`, -the table on the left is in an untidy, "wide" format because the year values -(2006, 2011, 2016) are stored as column names. 
-And as a consequence, -the values for population for the various cities -over these years are also split across several columns. +In {numref}`fig:02-wide-to-long`, +the table on the left is in an untidy, "wide" format because the year values +(2006, 2011, 2016) are stored as column names. +And as a consequence, +the values for population for the various cities +over these years are also split across several columns. For humans, this table is easy to read, which is why you will often find data stored in this wide format. However, this format is difficult to work with @@ -518,19 +396,24 @@ greatly simplified once the data is tidied. Another problem with data in this format is that we don't know what the numbers under each year actually represent. Do those numbers represent -population size? Land area? It's not clear. -To solve both of these problems, -we can reshape this data set to a tidy data format +population size? Land area? It's not clear. +To solve both of these problems, +we can reshape this data set to a tidy data format by creating a column called "year" and a column called "population." This transformation—which makes the data "longer"—is shown as the right table in -{numref}`fig:02-wide-to-long`. +{numref}`fig:02-wide-to-long`. Note that the number of entries in our data frame +can change in this transformation. The "untidy" data has 5 rows and 3 columns for +a total of 15 entries, whereas the "tidy" data on the right has 15 rows and 2 columns +for a total of 30 entries. +++ {"tags": []} ```{figure} img/pivot_functions/pivot_functions.001.jpeg :name: fig:02-wide-to-long -:figclass: caption-hack +:figclass: figure + + Melting data from a wide to long data format. ``` @@ -540,63 +423,64 @@ Melting data from a wide to long data format. ```{index} Canadian languages ``` -We can achieve this effect in Python using the `.melt` function from the `pandas` package. 
-The `.melt` function combines columns, -and is usually used during tidying data -when we need to make the data frame longer and narrower. -To learn how to use `.melt`, we will work through an example with the +We can achieve this effect in Python using the `melt` function from the `pandas` package. +The `melt` function combines columns, +and is usually used during tidying data +when we need to make the data frame longer and narrower. +To learn how to use `melt`, we will work through an example with the `region_lang_top5_cities_wide.csv` data set. This data set contains the -counts of how many Canadians cited each language as their mother tongue for five +counts of how many Canadians cited each language as their mother tongue for five major Canadian cities (Toronto, Montréal, Vancouver, Calgary and Edmonton) from -the 2016 Canadian census. -To get started, +the 2016 Canadian census. +To get started, we will use `pd.read_csv` to load the (untidy) data. ```{code-cell} ipython3 +:tags: ["output_scroll"] lang_wide = pd.read_csv("data/region_lang_top5_cities_wide.csv") lang_wide ``` -What is wrong with the untidy format above? -The table on the left in {numref}`fig:img-pivot-longer-with-table` +What is wrong with the untidy format above? +The table on the left in {numref}`fig:img-pivot-longer-with-table` represents the data in the "wide" (messy) format. -From a data analysis perspective, this format is not ideal because the values of -the variable *region* (Toronto, Montréal, Vancouver, Calgary and Edmonton) +From a data analysis perspective, this format is not ideal because the values of +the variable *region* (Toronto, Montréal, Vancouver, Calgary and Edmonton) are stored as column names. Thus they are not easily accessible to the data analysis functions we will apply to our data set. 
Additionally, the *mother tongue* variable values are spread across multiple columns, which will prevent us from doing any desired visualization or statistical tasks until we combine them into one column. For -instance, suppose we want to know the languages with the highest number of +instance, suppose we want to know the languages with the highest number of Canadians reporting it as their mother tongue among all five regions. This -question would be tough to answer with the data in its current format. -We *could* find the answer with the data in this format, +question would be tough to answer with the data in its current format. +We *could* find the answer with the data in this format, though it would be much easier to answer if we tidy our -data first. If mother tongue were instead stored as one column, -as shown in the tidy data on the right in +data first. If mother tongue were instead stored as one column, +as shown in the tidy data on the right in {numref}`fig:img-pivot-longer-with-table`, -we could simply use one line of code (`df["mother_tongue"].max()`) +we could simply use one line of code (`df["mother_tongue"].max()`) to get the maximum value. +++ {"tags": []} ```{figure} img/wrangling/pandas_melt_wide-long.png :name: fig:img-pivot-longer-with-table -:figclass: caption-hack +:figclass: figure -Going from wide to long with the `.melt` function. +Going from wide to long with the `melt` function. ``` +++ -{numref}`fig:img-pivot-longer` details the arguments that we need to specify -in the `.melt` function to accomplish this data transformation. +{numref}`fig:img-pivot-longer` details the arguments that we need to specify +in the `melt` function to accomplish this data transformation. +++ {"tags": []} ```{figure} img/wrangling/pandas_melt_args_labels.png :name: fig:img-pivot-longer -:figclass: caption-hack +:figclass: figure Syntax for the `melt` function. ``` @@ -609,29 +493,29 @@ Syntax for the `melt` function. 
```{index} see: :; column range ``` -We use `.melt` to combine the Toronto, MontrĆ©al, +We use `melt` to combine the Toronto, MontrĆ©al, Vancouver, Calgary, and Edmonton columns into a single column called `region`, and create a column called `mother_tongue` that contains the count of how many Canadians report each language as their mother tongue for each metropolitan -area. We specify `value_vars` to be all -the columns between Toronto and Edmonton: +area. ```{code-cell} ipython3 +:tags: ["output_scroll"] lang_mother_tidy = lang_wide.melt( id_vars=["category", "language"], - value_vars=["Toronto", "MontrĆ©al", "Vancouver", "Calgary", "Edmonton"], var_name="region", value_name="mother_tongue", ) - lang_mother_tidy ``` > **Note**: In the code above, the call to the -> `.melt` function is split across several lines. This is allowed in -> certain cases; for example, when calling a function as above, as long as the -> line ends with a comma `,` Python knows to keep reading on the next line. -> Splitting long lines like this across multiple lines is encouraged +> `melt` function is split across several lines. Recall from +> the {ref}`intro` chapter that this is allowed in +> certain cases. For example, when calling a function as above, the input +> arguments are between parentheses `()` and Python knows to keep reading on +> the next line. Each line ends with a comma `,` making it easier to read. +> Splitting long lines like this across multiple lines is encouraged > as it helps significantly with code readability. Generally speaking, you should > limit each line of code to about 80 characters. @@ -648,7 +532,7 @@ been met: +++ (pivot-wider)= -### Tidying up: going from long to wide using `.pivot` +### Tidying up: going from long to wide using `pivot` ```{index} pandas.DataFrame; pivot ``` @@ -656,17 +540,17 @@ been met: Suppose we have observations spread across multiple rows rather than in a single row.
For example, in {numref}`fig:long-to-wide`, the table on the left is in an untidy, long format because the `count` column contains three variables -(population, commuter, and incorporated count) and information about each observation -(here, population, commuter, and incorporated counts for a region) is split across three rows. -Remember: one of the criteria for tidy data +(population, commuter, and incorporated count) and information about each observation +(here, population, commuter, and incorporated counts for a region) is split across three rows. +Remember: one of the criteria for tidy data is that each observation must be in a single row. Using data in this format—where two or more variables are mixed together in a single column—makes it harder to apply many usual `pandas` functions. -For example, finding the maximum number of commuters +For example, finding the maximum number of commuters would require an additional step of filtering for the commuter values before the maximum can be computed. -In comparison, if the data were tidy, +In comparison, if the data were tidy, all we would have to do is compute the maximum value for the commuter column. To reshape this untidy data set to a tidy (and in this case, wider) format, we need to create columns called "population", "commuters", and "incorporated." @@ -676,62 +560,64 @@ This is illustrated in the right table of {numref}`fig:long-to-wide`. ```{figure} img/pivot_functions/pivot_functions.002.jpeg :name: fig:long-to-wide -:figclass: caption-hack +:figclass: figure Going from long to wide data. ``` +++ -To tidy this type of data in Python, we can use the `.pivot` function. -The `.pivot` function generally increases the number of columns (widens) -and decreases the number of rows in a data set. -To learn how to use `.pivot`, -we will work through an example -with the `region_lang_top5_cities_long.csv` data set. 
-This data set contains the number of Canadians reporting +To tidy this type of data in Python, we can use the `pivot` function. +The `pivot` function generally increases the number of columns (widens) +and decreases the number of rows in a data set. +To learn how to use `pivot`, +we will work through an example +with the `region_lang_top5_cities_long.csv` data set. +This data set contains the number of Canadians reporting the primary language at home and work for five major cities (Toronto, MontrĆ©al, Vancouver, Calgary and Edmonton). ```{code-cell} ipython3 +:tags: ["output_scroll"] lang_long = pd.read_csv("data/region_lang_top5_cities_long.csv") lang_long ``` -What makes the data set shown above untidy? -In this example, each observation is a language in a region. -However, each observation is split across multiple rows: -one where the count for `most_at_home` is recorded, -and the other where the count for `most_at_work` is recorded. -Suppose the goal with this data was to +What makes the data set shown above untidy? +In this example, each observation is a language in a region. +However, each observation is split across multiple rows: +one where the count for `most_at_home` is recorded, +and the other where the count for `most_at_work` is recorded. +Suppose the goal with this data was to visualize the relationship between the number of -Canadians reporting their primary language at home and work. +Canadians reporting their primary language at home and work. Doing that would be difficult with this data in its current form, since these two variables are stored in the same column. {numref}`fig:img-pivot-wider-table` shows how this data -will be tidied using the `.pivot` function. +will be tidied using the `pivot` function. +++ {"tags": []} ```{figure} img/wrangling/pandas_pivot_long-wide.png :name: fig:img-pivot-wider-table -:figclass: caption-hack +:figclass: figure -Going from long to wide with the `.pivot` function. 
+Going from long to wide with the `pivot` function. ``` +++ -{numref}`fig:img-pivot-wider` details the arguments that we need to specify -in the `.pivot` function. +{numref}`fig:img-pivot-wider` details the arguments that we need to specify in the `pivot` function. + +**TODO make figure match code below** +++ {"tags": []} ```{figure} img/wrangling/pandas_pivot_args_labels.png :name: fig:img-pivot-wider -:figclass: caption-hack +:figclass: figure -Syntax for the `.pivot` function. +Syntax for the `pivot` function. ``` +++ @@ -739,8 +625,11 @@ Syntax for the `.pivot` function. We will apply the function as detailed in {numref}`fig:img-pivot-wider`. ```{code-cell} ipython3 +:tags: ["output_scroll"] lang_home_tidy = lang_long.pivot( - index=["region", "category", "language"], columns=["type"], values=["count"] + index=["region", "category", "language"], + columns=["type"], + values=["count"] ).reset_index() lang_home_tidy.columns = [ @@ -753,11 +642,30 @@ lang_home_tidy.columns = [ lang_home_tidy ``` +In the first step, note that we added a call to `reset_index`. When `pivot` is called with +multiple column names passed to the `index`, those entries become the "name" of each row that +would be used when you filter rows with `[]` or `loc` rather than just simple numbers. This +can be confusing. What `reset_index` does is set us back to the usual expected behaviour +where each row is "named" with an integer. This is a subtle point, but the main take-away is that +when you call `pivot`, it is a good idea to call `reset_index` afterwards. + +The second operation we applied is to rename the columns. When we perform the `pivot` +operation, it keeps the original column name `"count"` and adds the `"type"` as a second column name. +Having two names for a column can be confusing! So we rename the columns, giving each column only one name. + +We can print out some useful information about our data frame using the `info` function.
+In the first row it tells us the `type` of `lang_home_tidy` (it is a `pandas` `DataFrame`). The second +row tells us how many rows there are: 1070, and to index those rows, you can use numbers between +0 and 1069 (remember that Python starts counting at 0!). Next, there is a print out about the data +columns. Here there are 5 columns total. The little table it prints out tells you the name of each +column, the number of non-null values (i.e., the number of entries that are not missing values), and +the type of the entries. Finally, the last two rows summarize the types of each column and how much +memory the data frame is using on your computer. ```{code-cell} ipython3 -lang_home_tidy.dtypes +lang_home_tidy.info() ``` -The data above is now tidy! We can go through the three criteria again to check +The data is now tidy! We can go through the three criteria again to check that this data is a tidy data set. 1. All the statistical variables are their own columns in the data frame (i.e., @@ -768,43 +676,45 @@ that this data is a tidy data set. frame is not shared with another value). You might notice that we have the same number of columns in the tidy data set as -we did in the messy one. Therefore `.pivot` didn't really "widen" the data. +we did in the messy one. Therefore `pivot` didn't really "widen" the data. This is just because the original `type` column only had -two categories in it. If it had more than two, `.pivot` would have created +two categories in it. If it had more than two, `pivot` would have created more columns, and we would see the data set "widen." + +++ (str-split)= -### Tidying up: using `.str.split` to deal with multiple delimiters +### Tidying up: using `str.split` to deal with multiple delimiters ```{index} pandas.Series; str.split, delimiter ``` -Data are also not considered tidy when multiple values are stored in the same +Data are also not considered tidy when multiple values are stored in the same cell.
The data set we show below is even messier than the ones we dealt with above: the `Toronto`, `MontrĆ©al`, `Vancouver`, `Calgary` and `Edmonton` columns contain the number of Canadians reporting their primary language at home and -work in one column separated by the delimiter (`/`). The column names are the +work in one column separated by the delimiter (`/`). The column names are the values of a variable, *and* each value does not have its own cell! To turn this messy data into tidy data, we'll have to fix these issues. ```{code-cell} ipython3 +:tags: ["output_scroll"] lang_messy = pd.read_csv("data/region_lang_top5_cities_messy.csv") lang_messy ``` -First we’ll use `.melt` to create two columns, `region` and `value`, -similar to what we did previously. +First we’ll use `melt` to create two columns, `region` and `value`, +similar to what we did previously. The new `region` columns will contain the region names, -and the new column `value` will be a temporary holding place for the -data that we need to further separate, i.e., the +and the new column `value` will be a temporary holding place for the +data that we need to further separate, i.e., the number of Canadians reporting their primary language at home and work. ```{code-cell} ipython3 +:tags: ["output_scroll"] lang_messy_longer = lang_messy.melt( id_vars=["category", "language"], - value_vars=["Toronto", "MontrĆ©al", "Vancouver", "Calgary", "Edmonton"], var_name="region", value_name="value", ) @@ -812,40 +722,73 @@ lang_messy_longer = lang_messy.melt( lang_messy_longer ``` -Next we'll use `.str.split` to split the `value` column into two columns. -One column will contain only the counts of Canadians -that speak each language most at home, -and the other will contain the counts of Canadians -that speak each language most at work for each region. +Next we'll split the `value` column into two columns. 
+In basic Python, if we wanted to split the string `"50/0"` into two numbers `["50", "0"]` +we would use the `split` method on the string, and specify that the split should be made +on the slash character `"/"`. +```{code-cell} ipython3 +"50/0".split("/") +``` + +The `pandas` package provides similar functions that we can access +by using the `str` method. So, to split all of the entries for an entire +column in a data frame, we would use the `str.split` method. +Once we use this method, +one column will contain only the counts of Canadians +that speak each language most at home, +and the other will contain the counts of Canadians +that speak each language most at work for each region. {numref}`fig:img-separate` -outlines what we need to specify to use `.str.split`. +outlines what we need to specify to use `str.split`. +++ {"tags": []} ```{figure} img/wrangling/str-split_args_labels.png :name: fig:img-separate -:figclass: caption-hack +:figclass: figure -Syntax for the `.str.split` function. +Syntax for the `str.split` function. ``` +We will do this in multiple steps. First, we create a new object +that contains two columns. We will set the `expand` argument to `True` +to tell `pandas` that we want to expand the output into two columns. + ```{code-cell} ipython3 -tidy_lang = ( - pd.concat( - (lang_messy_longer, lang_messy_longer["value"].str.split("/", expand=True)), - axis=1, - ) - .rename(columns={0: "most_at_home", 1: "most_at_work"}) - .drop(columns=["value"]) -) +split_counts = lang_messy_longer["value"].str.split("/", expand=True) +split_counts +``` +Since we only operated on the `value` column, the `split_counts` data frame +doesn't have the rest of the columns (`language`, `region`, etc.) +that were in our original data frame. We don't want to lose this information, so +we will concatenate (combine) the original data frame with `split_counts` using +the `concat` function from `pandas`. The `concat` function *concatenates* data frames +along an axis.
By default, it concatenates the data frames vertically along `axis=0` yielding a single +*taller* data frame. Since we want to concatenate our old columns to our +new `split_counts` data frame (to obtain a *wider* data frame), we will specify `axis=1`. +```{code-cell} ipython3 +:tags: ["output_scroll"] +tidy_lang = pd.concat( + [lang_messy_longer, split_counts], + axis=1, +) tidy_lang ``` +Next, we will rename our newly created columns (currently called +`0` and `1`) to the more meaningful names `"most_at_home"` and `"most_at_work"`, +and drop the `value` column from our data frame using the `drop` method. + ```{code-cell} ipython3 -tidy_lang.dtypes +:tags: ["output_scroll"] +tidy_lang = ( + tidy_lang.rename(columns={0: "most_at_home", 1: "most_at_work"}) + .drop(columns=["value"]) +) +tidy_lang ``` - +Note that we could have chained these steps together to make our code more compact. Is this data set now tidy? If we recall the three criteria for tidy data: - each row is a single observation, @@ -853,57 +796,36 @@ Is this data set now tidy? If we recall the three criteria for tidy data: - each value is a single cell. We can see that this data now satisfies all three criteria, making it easier to -analyze. But we aren't done yet! Notice in the table, all of the variables are -"object" data types. Object data types are columns of strings or columns with mixed types. In the previous example in Section {ref}`pivot-wider`, the -`most_at_home` and `most_at_work` variables were `int64` (integer)—you can -verify this by calling `df.dtypes`—which is a type -of numeric data. This change is due to the delimiter (`/`) when we read in this -messy data set. Python read these columns in as string types, and by default, -`.str.split` will return columns as object data types. - -It makes sense for `region`, `category`, and `language` to be stored as a -object type. 
However, suppose we want to apply any functions that treat the -`most_at_home` and `most_at_work` columns as a number (e.g., finding rows -above a numeric threshold of a column). -In that case, -it won't be possible to do if the variable is stored as a `object`. -Fortunately, the `pandas.to_numeric` function provides a natural way to fix problems -like this: it will convert the columns to the best numeric data types. - +analyze. But we aren't done yet! Although we can't see it in the data frame above, all of the variables are actually +"object" data types. We can check this using the `info` method. ```{code-cell} ipython3 -:tags: [remove-cell] - -# We can see that this data now satisfies all three criteria, making it easier to -# analyze. But we aren't done yet! Notice in the table above that the word -# `` appears beneath each of the column names. The word under the column name -# indicates the data type of each column. Here all of the variables are -# "character" data types. Recall, character data types are letter(s) or digits(s) -# surrounded by quotes. In the previous example in Section \@ref(pivot-wider), the -# `most_at_home` and `most_at_work` variables were `` (double)—you can -# verify this by looking at the tables in the previous sections—which is a type -# of numeric data. This change is due to the delimiter (`/`) when we read in this -# messy data set. R read these columns in as character types, and by default, -# `separate` will return columns as character data types. - -# It makes sense for `region`, `category`, and `language` to be stored as a -# character (or perhaps factor) type. However, suppose we want to apply any functions that treat the -# `most_at_home` and `most_at_work` columns as a number (e.g., finding rows -# above a numeric threshold of a column). -# In that case, -# it won't be possible to do if the variable is stored as a `character`. 
-# Fortunately, the `separate` function provides a natural way to fix problems -# like this: we can set `convert = TRUE` to convert the `most_at_home` -# and `most_at_work` columns to the correct data type. +tidy_lang.info() ``` +Object columns in `pandas` data frames are columns of strings or columns with +mixed types. In the previous example in the section on {ref}`pivot-wider`, the +`most_at_home` and `most_at_work` variables were `int64` (integer), which is a type of numeric data. +This change is due to the delimiter (`/`) when we read in this messy data set. +Python read these columns in as string types, and by default, `str.split` will +return columns with the `object` data type. + +It makes sense for `region`, `category`, and `language` to be stored as an +`object` type. However, suppose we want to apply any functions that treat the +`most_at_home` and `most_at_work` columns as a number (e.g., finding rows +above a numeric threshold of a column). +That won't be possible if the variable is stored as an `object`. +Fortunately, the `pandas.to_numeric` function provides a natural way to fix problems +like this: it will convert the columns to the best numeric data types. + ```{code-cell} ipython3 +:tags: ["output_scroll"] tidy_lang["most_at_home"] = pd.to_numeric(tidy_lang["most_at_home"]) tidy_lang["most_at_work"] = pd.to_numeric(tidy_lang["most_at_work"]) tidy_lang ``` ```{code-cell} ipython3 -tidy_lang.dtypes +tidy_lang.info() ``` Now we see `most_at_home` and `most_at_work` columns are of `int64` data types, @@ -911,122 +833,35 @@ indicating they are integer data types (i.e., numbers)! +++ -(loc-iloc)= -## Using `.loc[]` and `.iloc[]` to extract a range of columns - -```{index} pandas.DataFrame; loc[] -``` - -Now that the `tidy_lang` data is indeed *tidy*, we can start manipulating it -using the powerful suite of functions from the `pandas`.
-For the first example, recall `.loc[]` from Chapter {ref}`intro`, -which lets us create a subset of columns from a data frame. -Suppose we wanted to select only the columns `language`, `region`, -`most_at_home` and `most_at_work` from the `tidy_lang` data set. Using what we -learned in Chapter {ref}`intro`, we would pass all of these column names into the square brackets: - -```{code-cell} ipython3 -selected_columns = tidy_lang.loc[:, ["language", "region", "most_at_home", "most_at_work"]] -selected_columns -``` - -```{index} pandas.DataFrame; iloc[], column range -``` - -Here we wrote out the names of each of the columns. However, this method is -time-consuming, especially if you have a lot of columns! Another approach is to -index with integers. `.iloc[]` make it easier for -us to select columns. For instance, we can use `.iloc[]` to choose a -range of columns rather than typing each column name out. To do this, we use the -colon (`:`) operator to denote the range. For example, to get all the columns in -the `tidy_lang` data frame from `language` to `most_at_work`, we pass `:` before the comma indicating we want to retrieve all rows, and `1:` after the comma indicating we want only columns from index 1 (*i.e.* `language`) and afterwords. - -```{code-cell} ipython3 -:tags: [remove-cell] - -# Here we wrote out the names of each of the columns. However, this method is -# time-consuming, especially if you have a lot of columns! Another approach is to -# use a "select helper". Select helpers are operators that make it easier for -# us to select columns. For instance, we can use a select helper to choose a -# range of columns rather than typing each column name out. To do this, we use the -# colon (`:`) operator to denote the range. For example, to get all the columns in \index{column range} -# the `tidy_lang` data frame from `language` to `most_at_work` we pass -# `language:most_at_work` as the second argument to the `select` function. 
-``` - -```{code-cell} ipython3 -column_range = tidy_lang.iloc[:, 1:] -column_range -``` - -Notice that we get the same output as we did above, -but with less (and clearer!) code. This type of operator -is especially handy for large data sets. - -```{index} pandas.Series; str.startswith -``` - -Suppose instead we wanted to extract columns that followed a particular pattern -rather than just selecting a range. For example, let's say we wanted only to select the -columns `most_at_home` and `most_at_work`. There are other functions that allow -us to select variables based on their names. In particular, we can use the `.str.startswith` method -to choose only the columns that start with the word "most": - -```{code-cell} ipython3 -tidy_lang.loc[:, tidy_lang.columns.str.startswith('most')] -``` - -```{index} pandas.Series; str.contains -``` - -We could also have chosen the columns containing an underscore `_` by using the -`.str.contains("_")`, since we notice -the columns we want contain underscores and the others don't. - -```{code-cell} ipython3 -tidy_lang.loc[:, tidy_lang.columns.str.contains('_')] -``` - -There are many different functions that help with selecting -variables based on certain criteria. -The additional resources section at the end of this chapter -provides a comprehensive resource on these functions. - -```{code-cell} ipython3 -:tags: [remove-cell] - -# There are many different `select` helpers that select -# variables based on certain criteria. -# The additional resources section at the end of this chapter -# provides a comprehensive resource on `select` helpers. -``` - -## Using `df[]` to extract rows +## Using `[]` to extract rows or columns -Next, we revisit the `df[]` from Chapter {ref}`intro`, -which lets us create a subset of rows from a data frame. -Recall the argument to the `df[]`: -column names or a logical statement evaluated to either `True` or `False`; -`df[]` works by returning the rows where the logical statement evaluates to `True`. 
-This section will highlight more advanced usage of the `df[]` function. +Now that the `tidy_lang` data is indeed *tidy*, we can start manipulating it +using the powerful suite of functions from the `pandas` package. +We revisit the `[]` operation from the chapter on {ref}`intro`, +which lets us create a subset of rows from a data frame. +Recall the argument to `[]`: +a list of column names, or a logical statement that evaluates to either `True` or `False`, +where `[]` returns the rows where the logical statement evaluates to `True`. +This section will highlight more advanced usage of the `[]` function. In particular, this section provides an in-depth treatment of the variety of logical statements -one can use in the `df[]` to select subsets of rows. +one can use in the `[]` to select subsets of rows. +++ ### Extracting rows that have a certain value with `==` Suppose we are only interested in the subset of rows in `tidy_lang` corresponding to the official languages of Canada (English and French). -We can extract these rows by using the *equivalency operator* (`==`) -to compare the values of the `category` column -with the value `"Official languages"`. -With these arguments, `df[]` returns a data frame with all the columns -of the input data frame -but only the rows we asked for in the logical statement, i.e., +We can extract these rows by using the *equivalency operator* (`==`) +to compare the values of the `category` column +with the value `"Official languages"`. +With these arguments, `[]` returns a data frame with all the columns +of the input data frame +but only the rows we asked for in the logical statement, i.e., those where the `category` column holds the value `"Official languages"`. We name this data frame `official_langs`.
```{code-cell} ipython3 +:tags: ["output_scroll"] official_langs = tidy_lang[tidy_lang["category"] == "Official languages"] official_langs ``` @@ -1034,30 +869,34 @@ official_langs ### Extracting rows that do not have a certain value with `!=` What if we want all the other language categories in the data set *except* for -those in the `"Official languages"` category? We can accomplish this with the `!=` +those in the `"Official languages"` category? We can accomplish this with the `!=` operator, which means "not equal to". So if we want to find all the rows where the `category` does *not* equal `"Official languages"` we write the code below. ```{code-cell} ipython3 +:tags: ["output_scroll"] tidy_lang[tidy_lang["category"] != "Official languages"] ``` (filter-and)= ### Extracting rows satisfying multiple conditions using `&` -Suppose now we want to look at only the rows -for the French language in MontrĆ©al. -To do this, we need to filter the data set -to find rows that satisfy multiple conditions simultaneously. +Suppose now we want to look at only the rows +for the French language in MontrĆ©al. +To do this, we need to filter the data set +to find rows that satisfy multiple conditions simultaneously. We can do this with the ampersand symbol (`&`), which -is interpreted by Python as "and". -We write the code as shown below to filter the `official_langs` data frame -to subset the rows where `region == "MontrĆ©al"` -*and* the `language == "French"`. +is interpreted by Python as "and". +We write the code as shown below to filter the `official_langs` data frame +to subset the rows where `region == "MontrĆ©al"` +*and* `language == "French"`. 
```{code-cell} ipython3 -tidy_lang[(tidy_lang["region"] == "MontrĆ©al") & (tidy_lang["language"] == "French")] +tidy_lang[ + (tidy_lang["region"] == "MontrĆ©al") & + (tidy_lang["language"] == "French") +] ``` +++ {"tags": []} @@ -1065,37 +904,39 @@ tidy_lang[(tidy_lang["region"] == "MontrĆ©al") & (tidy_lang["language"] == "Fren ### Extracting rows satisfying at least one condition using `|` Suppose we were interested in only those rows corresponding to cities in Alberta -in the `official_langs` data set (Edmonton and Calgary). +in the `official_langs` data set (Edmonton and Calgary). We can't use `&` as we did above because `region` -cannot be both Edmonton *and* Calgary simultaneously. -Instead, we can use the vertical pipe (`|`) logical operator, -which gives us the cases where one condition *or* -another condition *or* both are satisfied. +cannot be both Edmonton *and* Calgary simultaneously. +Instead, we can use the vertical pipe (`|`) logical operator, +which gives us the cases where one condition *or* +another condition *or* both are satisfied. In the code below, we ask Python to return the rows where the `region` columns are equal to "Calgary" *or* "Edmonton". ```{code-cell} ipython3 official_langs[ - (official_langs["region"] == "Calgary") | (official_langs["region"] == "Edmonton") + (official_langs["region"] == "Calgary") | + (official_langs["region"] == "Edmonton") ] ``` -### Extracting rows with values in a list using `.isin()` +### Extracting rows with values in a list using `isin` -Next, suppose we want to see the populations of our five cities. -Let's read in the `region_data.csv` file -that comes from the 2016 Canadian census, -as it contains statistics for number of households, land area, population +Next, suppose we want to see the populations of our five cities. 
+Let's read in the `region_data.csv` file +that comes from the 2016 Canadian census, +as it contains statistics for number of households, land area, population and number of dwellings for different regions. ```{code-cell} ipython3 +:tags: ["output_scroll"] region_data = pd.read_csv("data/region_data.csv") region_data ``` -To get the population of the five cities -we can filter the data set using the `.isin` method. -The `.isin` method is used to see if an element belongs to a list. +To get the population of the five cities +we can filter the data set using the `isin` method. +The `isin` method is used to see if an element belongs to a list. Here we are filtering for rows where the value in the `region` column matches any of the five cities we are intersted in: Toronto, MontrĆ©al, Vancouver, Calgary, and Edmonton. @@ -1106,7 +947,7 @@ five_cities = region_data[region_data["region"].isin(city_names)] five_cities ``` -> **Note:** What's the difference between `==` and `.isin`? Suppose we have two +> **Note:** What's the difference between `==` and `isin`? Suppose we have two > Series, `seriesA` and `seriesB`. If you type `seriesA == seriesB` into Python it > will compare the series element by element. Python checks if the first element of > `seriesA` equals the first element of `seriesB`, the second element of @@ -1114,7 +955,7 @@ five_cities > `seriesA.isin(seriesB)` compares the first element of `seriesA` to all the > elements in `seriesB`. Then the second element of `seriesA` is compared > to all the elements in `seriesB`, and so on. Notice the difference between `==` and -> `.isin` in the example below. +> `isin` in the example below. 
```{code-cell} ipython3 pd.Series(["Vancouver", "Toronto"]) == pd.Series(["Toronto", "Vancouver"]) @@ -1124,25 +965,6 @@ pd.Series(["Vancouver", "Toronto"]) == pd.Series(["Toronto", "Vancouver"]) pd.Series(["Vancouver", "Toronto"]).isin(pd.Series(["Toronto", "Vancouver"])) ``` -```{code-cell} ipython3 -:tags: [remove-cell] - -# > **Note:** What's the difference between `==` and `%in%`? Suppose we have two -# > vectors, `vectorA` and `vectorB`. If you type `vectorA == vectorB` into R it -# > will compare the vectors element by element. R checks if the first element of -# > `vectorA` equals the first element of `vectorB`, the second element of -# > `vectorA` equals the second element of `vectorB`, and so on. On the other hand, -# > `vectorA %in% vectorB` compares the first element of `vectorA` to all the -# > elements in `vectorB`. Then the second element of `vectorA` is compared -# > to all the elements in `vectorB`, and so on. Notice the difference between `==` and -# > `%in%` in the example below. -# > -# >``` {r} -# >c("Vancouver", "Toronto") == c("Toronto", "Vancouver") -# >c("Vancouver", "Toronto") %in% c("Toronto", "Vancouver") -# >``` -``` - ### Extracting rows above or below a threshold using `>` and `<` ```{code-cell} ipython3 @@ -1152,1262 +974,828 @@ glue("census_popn", "{0:,.0f}".format(35151728)) glue("most_french", "{0:,.0f}".format(2669195)) ``` -We saw in Section {ref}`filter-and` that -{glue:text}`most_french` people reported -speaking French in MontrĆ©al as their primary language at home. -If we are interested in finding the official languages in regions -with higher numbers of people who speak it as their primary language at home -compared to French in MontrĆ©al, then we can use `df[]` to obtain rows -where the value of `most_at_home` is greater than -{glue:text}`most_french`. +We saw in the section on {ref}`filter-and` that +{glue:text}`most_french` people reported +speaking French in MontrĆ©al as their primary language at home. 
+If we are interested in finding the official languages in regions +with higher numbers of people who speak it as their primary language at home +compared to French in Montréal, then we can use `[]` to obtain rows +where the value of `most_at_home` is greater than +{glue:text}`most_french`. We use the `>` symbol to look for values *above* a threshold, +and the `<` symbol to look for values *below* a threshold. The `>=` and `<=` +symbols similarly look for *equal to or above* a threshold and *equal to or below* a threshold. ```{code-cell} ipython3 official_langs[official_langs["most_at_home"] > 2669195] ``` -This operation returns a data frame with only one row, indicating that when -considering the official languages, -only English in Toronto is reported by more people -as their primary language at home +This operation returns a data frame with only one row, indicating that when +considering the official languages, +only English in Toronto is reported by more people +as their primary language at home than French in Montréal according to the 2016 Canadian census. -+++ {"tags": []} +### Extracting rows using `query` -(pandas-assign)= -## Using `.assign` to modify or add columns +You can also extract rows above, below, equal or not-equal to a threshold using the +`query` method. For example, the following gives us the same result as when we used +`official_langs[official_langs["most_at_home"] > 2669195]`. -+++ +```{code-cell} ipython3 +official_langs.query("most_at_home > 2669195") +``` -### Using `.assign` to modify columns +The query (criteria we are using to select values) is input as a string. The `query` method +is less often used than the earlier approaches we introduced, but it can come in handy +to make long chains of filtering operations a bit easier to read. -```{index} pandas.DataFrame; df[] +(loc-iloc)= +## Using `loc[]` to filter rows and select columns.
+```{index} pandas.DataFrame; loc[] ``` -In Section {ref}`str-split`, -when we first read in the `"region_lang_top5_cities_messy.csv"` data, -all of the variables were "object" data types. -During the tidying process, -we used the `pandas.to_numeric` function -to convert the `most_at_home` and `most_at_work` columns -to the desired integer (i.e., numeric class) data types and then used `df[]` to overwrite columns. -But suppose we didn't use the `df[]`, -and needed to modify the columns some other way. -Below we create such a situation -so that we can demonstrate how to use `.assign` -to change the column types of a data frame. -`.assign` is a useful function to modify or create new data frame columns. +The `[]` operation is only used when you want to filter rows or select columns; +it cannot be used to do both operations at the same time. This is where `loc[]` +comes in. For the first example, recall `loc[]` from Chapter {ref}`intro`, +which lets us create a subset of columns from a data frame. +Suppose we wanted to select only the columns `language`, `region`, +`most_at_home` and `most_at_work` from the `tidy_lang` data set. Using what we +learned in the chapter on {ref}`intro`, we would pass all of these column names into the square brackets. 
```{code-cell} ipython3 -lang_messy = pd.read_csv("data/region_lang_top5_cities_messy.csv") -lang_messy_longer = lang_messy.melt( - id_vars=["category", "language"], - value_vars=["Toronto", "MontrĆ©al", "Vancouver", "Calgary", "Edmonton"], - var_name="region", - value_name="value", -) -tidy_lang_obj = ( - pd.concat( - (lang_messy_longer, lang_messy_longer["value"].str.split("/", expand=True)), - axis=1, - ) - .rename(columns={0: "most_at_home", 1: "most_at_work"}) - .drop(columns=["value"]) -) -official_langs_obj = tidy_lang_obj[tidy_lang_obj["category"] == "Official languages"] - -official_langs_obj +:tags: ["output_scroll"] +selected_columns = tidy_lang.loc[:, ["language", "region", "most_at_home", "most_at_work"]] +selected_columns ``` +We pass `:` before the comma indicating we want to retrieve all rows, and the list indicates +the columns that we want. + +Note that we could obtain the same result by stating that we would like all of the columns +from `language` through `most_at_work`. Instead of passing a list of all of the column +names that we want, we can ask for the range of columns `"language":"most_at_work"`, which +you can read as "The columns from `language` to `most_at_work`". ```{code-cell} ipython3 -official_langs_obj.dtypes +:tags: ["output_scroll"] +selected_columns = tidy_lang.loc[:, "language":"most_at_work"] +selected_columns ``` -To use the `.assign` method, again we first specify the object to be the data set, -and in the following arguments, -we specify the name of the column we want to modify or create -(here `most_at_home` and `most_at_work`), an `=` sign, -and then the function we want to apply (here `pandas.to_numeric`). -In the function we want to apply, -we refer to the column upon which we want it to act -(here `most_at_home` and `most_at_work`). 
-In our example, we are naming the columns the same -names as columns that already exist in the data frame -("most\_at\_home", "most\_at\_work") -and this will cause `.assign` to *overwrite* those columns -(also referred to as modifying those columns *in-place*). -If we were to give the columns a new name, -then `.assign` would create new columns with the names we specified. -`.assign`'s general syntax is detailed in {numref}`fig:img-assign`. - -+++ {"tags": []} - -```{figure} img/wrangling/pandas_assign_args_labels.png -:name: fig:img-assign -:figclass: caption-hack +Similarly, you can ask for all of the columns including and after `language` by doing the following: -Syntax for the `.assign` function. +```{code-cell} ipython3 +:tags: ["output_scroll"] +selected_columns = tidy_lang.loc[:, "language":] +selected_columns ``` -+++ +By not putting anything after the `:`, Python reads this as "from `language` until the last column". +Although the notation for selecting a range using `:` is convenient because less code is required, +it must be used carefully. If you were to re-order columns or add a column to the data frame, the +output would change. Using a list is more explicit and less prone to potential confusion. -Below we use `.assign` to convert the columns `most_at_home` and `most_at_work` -to numeric data types in the `official_langs` data set as described in -{numref}`fig:img-assign`: +Suppose instead we wanted to extract columns that followed a particular pattern +rather than just selecting a range. For example, let's say we wanted only to select the +columns `most_at_home` and `most_at_work`. There are other functions that allow +us to select variables based on their names.
In particular, we can use the `.str.startswith` method +to choose only the columns that start with the word "most": ```{code-cell} ipython3 -official_langs_numeric = official_langs_obj.assign( - most_at_home=pd.to_numeric(official_langs_obj["most_at_home"]), - most_at_work=pd.to_numeric(official_langs_obj["most_at_work"]), -) - -official_langs_numeric +tidy_lang.loc[:, tidy_lang.columns.str.startswith('most')] ``` -```{code-cell} ipython3 -official_langs_numeric.dtypes +```{index} pandas.Series; str.contains ``` -Now we see that the `most_at_home` and `most_at_work` columns are both `int64` (which is a numeric data type)! +We could also have chosen the columns containing an underscore `_` by using the +`.str.contains("_")`, since we notice +the columns we want contain underscores and the others don't. -+++ +```{code-cell} ipython3 +tidy_lang.loc[:, tidy_lang.columns.str.contains('_')] +``` -### Using `.assign` to create new columns +There are many different functions that help with selecting +variables based on certain criteria. +The additional resources section at the end of this chapter +provides a comprehensive resource on these functions. ```{code-cell} ipython3 :tags: [remove-cell] -number_most_home = int( - official_langs[ - (official_langs["language"] == "English") - & (official_langs["region"] == "Toronto") - ]["most_at_home"] -) - -toronto_popn = int(region_data[region_data["region"] == "Toronto"]["population"]) - -glue("number_most_home", "{0:,.0f}".format(number_most_home)) -glue("toronto_popn", "{0:,.0f}".format(toronto_popn)) -glue("prop_eng_tor", "{0:.2f}".format(number_most_home / toronto_popn)) +# There are many different `select` helpers that select +# variables based on certain criteria. +# The additional resources section at the end of this chapter +# provides a comprehensive resource on `select` helpers. 
``` -We can see in the table that -{glue:text}`number_most_home` people reported -speaking English in Toronto as their primary language at home, according to -the 2016 Canadian census. What does this number mean to us? To understand this -number, we need context. In particular, how many people were in Toronto when -this data was collected? From the 2016 Canadian census profile, the population -of Toronto was reported to be -{glue:text}`toronto_popn` people. -The number of people who report that English is their primary language at home -is much more meaningful when we report it in this context. -We can even go a step further and transform this count to a relative frequency -or proportion. -We can do this by dividing the number of people reporting a given language -as their primary language at home by the number of people who live in Toronto. -For example, -the proportion of people who reported that their primary language at home -was English in the 2016 Canadian census was {glue:text}`prop_eng_tor` -in Toronto. - -Let's use `.assign` to create a new column in our data frame -that holds the proportion of people who speak English -for our five cities of focus in this chapter. -To accomplish this, we will need to do two tasks -beforehand: - -1. Create a list containing the population values for the cities. -2. Filter the `official_langs` data frame -so that we only keep the rows where the language is English. - -To create a list containing the population values for the five cities -(Toronto, MontrĆ©al, Vancouver, Calgary, Edmonton), -we will use the `[]` (recall that we can also use `list()` to create a list): - -```{code-cell} ipython3 -city_pops = [5928040, 4098927, 2463431, 1392609, 1321426] -city_pops +## Using `iloc[]` to extract a range of columns +```{index} pandas.DataFrame; iloc[], column range ``` - -And next, we will filter the `official_langs` data frame -so that we only keep the rows where the language is English. 
-We will name the new data frame we get from this `english_langs`: +Another approach for selecting columns is to use `iloc[]`, +which provides the ability to index with integers rather than the names of the columns. +For example, the column names of the `tidy_lang` data frame are +`['category', 'language', 'region', 'most_at_home', 'most_at_work']`. +Using `iloc[]`, you can ask for the `language` column by requesting the +column at index `1` (remember that Python starts counting at `0`, so the second item `'language'` +has index `1`!). ```{code-cell} ipython3 -english_langs = official_langs[official_langs["language"] == "English"] -english_langs +column = tidy_lang.iloc[:, 1] +column ``` -Finally, we can use `.assign` to create a new column, -named `most_at_home_proportion`, that will have value that corresponds to -the proportion of people reporting English as their primary -language at home. -We will compute this by dividing the column by our vector of city populations. +You can also ask for multiple columns, just like we did with `[]`. We pass `:` before +the comma, indicating we want to retrieve all rows, and `1:` after the comma +indicating we want columns after and including index 1 (*i.e.* `language`). ```{code-cell} ipython3 -english_langs = english_langs.assign( - most_at_home_proportion=english_langs["most_at_home"] / city_pops -) - -english_langs +column_range = tidy_lang.iloc[:, 1:] +column_range ``` -In the computation above, we had to ensure that we ordered the `city_pops` vector in the -same order as the cities were listed in the `english_langs` data frame. -This is because Python will perform the division computation we did by dividing -each element of the `most_at_home` column by each element of the -`city_pops` list, matching them up by position. -Failing to do this would have resulted in the incorrect math being performed. +The `iloc[]` method is less commonly used, and needs to be used with care. 
+For example, it is easy to +accidentally put in the wrong integer index! If you did not correctly remember +that the `language` column was index `1`, and used `2` instead, your code +would end up having a bug that might be quite hard to track down. -> **Note:** In more advanced data wrangling, -> one might solve this problem in a less error-prone way though using -> a technique called "joins". -> We link to resources that discuss this in the additional -> resources at the end of this chapter. +```{index} pandas.Series; str.startswith +``` -+++ ++++ {"tags": []} - +## Aggregating data +++ -## Combining functions by chaining the methods +### Calculating summary statistics on individual columns -```{index} chaining methods +```{index} summarize ``` -In Python, we often have to call multiple methods in a sequence to process a data -frame. The basic ways of doing this can become quickly unreadable if there are -many steps. For example, suppose we need to perform three operations on a data -frame called `data`: +As a part of many data analyses, we need to calculate a summary value for the +data (a *summary statistic*). +Examples of summary statistics we might want to calculate +are the number of observations, the average/mean value for a column, +the minimum value, etc. +Oftentimes, +this summary statistic is calculated from the values in a data frame column, +or columns, as shown in {numref}`fig:summarize`. -1) add a new column `new_col` that is double another `old_col`, -2) filter for rows where another column, `other_col`, is more than 5, and -3) select only the new column `new_col` for those rows. 
++++ {"tags": []} -One way of performing these three steps is to just write -multiple lines of code, storing temporary objects as you go: +```{figure} img/summarize/summarize.001.jpeg +:name: fig:summarize +:figclass: figure -```{code-cell} ipython3 -:tags: [remove-cell] +Calculating summary statistics on one or more column(s) in `pandas` generally +creates a series or data frame containing the summary statistic(s) for each column +being summarized. The darker, top row of each table represents column headers. +``` -# ## Combining functions using the pipe operator, `|>` ++++ -# In R, we often have to call multiple functions in a sequence to process a data -# frame. The basic ways of doing this can become quickly unreadable if there are -# many steps. For example, suppose we need to perform three operations on a data -# frame called `data`: \index{pipe}\index{aaapipesymb@\vert{}>|see{pipe}} -``` +We will start by showing how to compute the minimum and maximum number of Canadians reporting a particular +language as their primary language at home. First, a reminder of what `region_lang` looks like: ```{code-cell} ipython3 -:tags: [remove-cell] - -data = pd.DataFrame({"old_col": [1, 2, 5, 0], "other_col": [1, 10, 3, 6]}) -``` - -```{code-cell} ipython3 -:tags: [remove-output] - -output_1 = data.assign(new_col=data["old_col"] * 2) -output_2 = output_1[output_1["other_col"] > 5] -output = output_2.loc[:, "new_col"] +:tags: ["output_scroll"] +region_lang = pd.read_csv("data/region_lang.csv") +region_lang ``` -This is difficult to understand for multiple reasons. The reader may be tricked -into thinking the named `output_1` and `output_2` objects are important for some -reason, while they are just temporary intermediate computations. Further, the -reader has to look through and find where `output_1` and `output_2` are used in -each subsequent line. - -+++ - -Chaining the sequential functions solves this problem, resulting in cleaner and -easier-to-follow code. 
-The code below accomplishes the same thing as the previous -two code blocks: +We use `.min` to calculate the minimum +and `.max` to calculate maximum number of Canadians +reporting a particular language as their primary language at home, +for any region. ```{code-cell} ipython3 -:tags: [remove-output] - -output = ( - data.assign(new_col=data["old_col"] * 2) - .query("other_col > 5") - .loc[:, "new_col"] -) +region_lang["most_at_home"].min() ``` ```{code-cell} ipython3 -:tags: [remove-cell] - -# ``` {r eval = F} -# output <- select(filter(mutate(data, new_col = old_col * 2), -# other_col > 5), -# new_col) -# ``` -# Code like this can also be difficult to understand. Functions compose (reading -# from left to right) in the *opposite order* in which they are computed by R -# (above, `mutate` happens first, then `filter`, then `select`). It is also just a -# really long line of code to read in one go. - -# The *pipe operator* (`|>`) solves this problem, resulting in cleaner and -# easier-to-follow code. `|>` is built into R so you don't need to load any -# packages to use it. -# You can think of the pipe as a physical pipe. It takes the output from the -# function on the left-hand side of the pipe, and passes it as the first argument -# to the function on the right-hand side of the pipe. -# The code below accomplishes the same thing as the previous -# two code blocks: -``` - -> **Note:** You might also have noticed that we split the function calls across -> lines, similar to when we did this earlier in the chapter -> for long function calls. Again, this is allowed and recommended, especially when -> the chained function calls create a long line of code. Doing this makes -> your code more readable. When you do this, it is important to use parentheses -> to tell Python that your code is continuing onto the next line. 
+region_lang["most_at_home"].max() +``` ```{code-cell} ipython3 :tags: [remove-cell] - -# > **Note:** You might also have noticed that we split the function calls across -# > lines after the pipe, similar to when we did this earlier in the chapter -# > for long function calls. Again, this is allowed and recommended, especially when -# > the piped function calls create a long line of code. Doing this makes -# > your code more readable. When you do this, it is important to end each line -# > with the pipe operator `|>` to tell R that your code is continuing onto the -# > next line. - -# > **Note:** In this textbook, we will be using the base R pipe operator syntax, `|>`. -# > This base R `|>` pipe operator was inspired by a previous version of the pipe -# > operator, `%>%`. The `%>%` pipe operator is not built into R -# > and is from the `magrittr` R package. -# > The `tidyverse` metapackage imports the `%>%` pipe operator via `dplyr` -# > (which in turn imports the `magrittr` R package). -# > There are some other differences between `%>%` and `|>` related to -# > more advanced R uses, such as sharing and distributing code as R packages, -# > however, these are beyond the scope of this textbook. -# > We have this note in the book to make the reader aware that `%>%` exists -# > as it is still commonly used in data analysis code and in many data science -# > books and other resources. -# > In most cases these two pipes are interchangeable and either can be used. 
- -# \index{pipe}\index{aaapipesymbb@\%>\%|see{pipe}} -``` - -### Chaining `df[]` and `.loc` - -+++ - -Let's work with the tidy `tidy_lang` data set from Section {ref}`str-split`, -which contains the number of Canadians reporting their primary language at home -and work for five major cities -(Toronto, MontrĆ©al, Vancouver, Calgary, and Edmonton): - -```{code-cell} ipython3 -tidy_lang +glue("lang_most_people", "{0:,.0f}".format(int(region_lang["most_at_home"].max()))) ``` -Suppose we want to create a subset of the data with only the languages and -counts of each language spoken most at home for the city of Vancouver. To do -this, we can use the `df[]` and `.loc`. First, we use `df[]` to -create a data frame called `van_data` that contains only values for Vancouver. - +From this we see that there are some languages in the data set that no one speaks +as their primary language at home. We also see that the most commonly spoken +primary language at home is spoken by +{glue:text}`lang_most_people` people. If instead we wanted to know the +total number of people in the survey, we could use the `sum` summary statistic method. ```{code-cell} ipython3 -van_data = tidy_lang[tidy_lang["region"] == "Vancouver"] -van_data +region_lang["most_at_home"].sum() ``` -We then use `.loc` on this data frame to keep only the variables we want: +Other handy summary statistics include the `mean`, `median` and `std` for +computing the mean, median, and standard deviation of observations, respectively. +We can also compute multiple statistics at once using `agg` to "aggregate" results. +For example, if we wanted to +compute both the `min` and `max` at once, we could use `agg` with the argument `['min', 'max']`. +Note that `agg` outputs a `Series` object. 
```{code-cell} ipython3 -van_data_selected = van_data.loc[:, ["language", "most_at_home"]] -van_data_selected +region_lang["most_at_home"].agg(["min", "max"]) ``` -Although this is valid code, there is a more readable approach we could take by -chaining the operations. With chaining, we do not need to create an intermediate -object to store the output from `df[]`. Instead, we can directly call `.loc` upon the -output of `df[]`: +The `pandas` package also provides the `describe` method, +which is a handy function that computes many common summary statistics at once; it +gives us a *summary* of a variable. ```{code-cell} ipython3 -van_data_selected = tidy_lang[tidy_lang["region"] == "Vancouver"].loc[ - :, ["language", "most_at_home"] -] - -van_data_selected +region_lang["most_at_home"].describe() ``` -```{code-cell} ipython3 -:tags: [remove-cell] +In addition to the summary methods we introduced earlier, the `describe` method +outputs a `count` (the total number of observations, or rows, in our data frame), +as well as the 25th, 50th, and 75th percentiles. +{numref}`tab:basic-summary-statistics` provides an overview of some of the useful +summary statistics that you can compute with `pandas`. -# But wait...Why do the `select` and `filter` function calls -# look different in these two examples? -# Remember: when you use the pipe, -# the output of the first function is automatically provided -# as the first argument for the function that comes after it. -# Therefore you do not specify the first argument in that function call. -# In the code above, -# the first line is just the `tidy_lang` data frame with a pipe. -# The pipe passes the left-hand side (`tidy_lang`) to the first argument of the function on the right (`filter`), -# so in the `filter` function you only see the second argument (and beyond). -# Then again after `filter` there is a pipe, which passes the result of the `filter` step -# to the first argument of the `select` function. 
+```{table} Basic summary statistics +:name: tab:basic-summary-statistics +| Function | Description | +| -------- | ----------- | +| `count` | The number of observations (rows) | +| `mean` | The mean of the observations | +| `median` | The median value of the observations | +| `std` | The standard deviation of the observations | +| `max` | The largest value in a column | +| `min` | The smallest value in a column | +| `sum` | The sum of all observations | +| `agg` | Aggregate multiple statistics together | +| `describe` | a summary | ``` -As you can see, both of these approaches—with and without chaining—give us the same output, but the second -approach is clearer and more readable. - +++ - -### Chaining more than two functions - +++ -Chaining can be used with any method in Python. -Additionally, we can chain together more than two functions. -For example, we can chain together three functions to: - -- extract rows (`df[]`) to include only those where the counts of the language most spoken at home are greater than 10,000, -- extract only the columns (`.loc`) corresponding to `region`, `language` and `most_at_home`, and -- sort the data frame rows in order (`.sort_values`) by counts of the language most spoken at home -from smallest to largest. -```{index} pandas.DataFrame; sort_values -``` +> **Note:** In `pandas`, the value `NaN` is often used to denote missing data. +> By default, when `pandas` calculates summary statistics (e.g., `max`, `min`, `sum`, etc), +> it ignores these values. If you look at the documentation for these functions, you will +> see an input variable `skipna`, which by default is set to `skipna=True`. This means that +> `pandas` will skip `NaN` values when computing statistics. -As we saw in Chapter {ref}`intro`, -we can use the `.sort_values` function -to order the rows in the data frame by the values of one or more columns. 
-Here we pass the column name `most_at_home` to sort the data frame rows by the values in that column, in ascending order. +### Calculating summary statistics on data frames +What if you want to calculate summary statistics on an entire data frame? Well, +it turns out that the functions in {numref}`tab:basic-summary-statistics` +can be applied to a whole data frame! +For example, we can ask for the number of rows that each column has using `count`. ```{code-cell} ipython3 -large_region_lang = ( - tidy_lang[tidy_lang["most_at_home"] > 10000] - .loc[:, ["region", "language", "most_at_home"]] - .sort_values(by="most_at_home") -) - -large_region_lang +region_lang.count() ``` - +Not surprisingly, they are all the same. We could also ask for the `mean`, but +some of the columns in `region_lang` contain string data with words like `"Vancouver"` +and `"Halifax"`---for these columns there is no way for `pandas` to compute the mean. +So we provide the keyword `numeric_only=True` so that it only computes the mean of columns with numeric values. This +is also needed if you want the `sum` or `std`. ```{code-cell} ipython3 -:tags: [remove-cell] - -# You will notice above that we passed `tidy_lang` as the first argument of the `filter` function. -# We can also pipe the data frame into the same sequence of functions rather than -# using it as the first argument of the first function. These two choices are equivalent, -# and we get the same result. -# ``` {r} -# large_region_lang <- tidy_lang |> -# filter(most_at_home > 10000) |> -# select(region, language, most_at_home) |> -# arrange(most_at_home) - -# large_region_lang -# ``` -``` - -Now that we've shown you chaining as an alternative to storing -temporary objects and composing code, does this mean you should *never* store -temporary objects or compose code? Not necessarily! -There are times when you will still want to do these things. 
-For example, you might store a temporary object before feeding it into a plot function -so you can iteratively change the plot without having to -redo all of your data transformations. -Additionally, chaining many functions can be overwhelming and difficult to debug; -you may want to store a temporary object midway through to inspect your result -before moving on with further steps. - -+++ - -## Aggregating data with `.assign`, `.agg` and `.apply` - -+++ - -### Calculating summary statistics on whole columns - -```{index} summarize -``` - -As a part of many data analyses, we need to calculate a summary value for the -data (a *summary statistic*). -Examples of summary statistics we might want to calculate -are the number of observations, the average/mean value for a column, -the minimum value, etc. -Oftentimes, -this summary statistic is calculated from the values in a data frame column, -or columns, as shown in {numref}`fig:summarize`. - -+++ {"tags": []} - -```{figure} img/summarize/summarize.001.jpeg -:name: fig:summarize -:figclass: caption-hack - -Calculating summary statistics on one or more column(s). In its simplest use case, it creates a new data frame with a single row containing the summary statistic(s) for each column being summarized. The darker, top row of each table represents the column headers. +region_lang.mean(numeric_only=True) ``` - -+++ - -We can use `.assign` as mentioned in Section {ref}`pandas-assign` along with proper summary functions to create a aggregated column. - -First a reminder of what `region_lang` looks like: - +If we ask for the `min` or the `max`, `pandas` will give you the smallest or largest number +for columns with numeric values. For columns with text, it will return the +value that comes first in alphabetical (lexicographic) order for `min` and the value that comes last for `max`. Again, +if you only want the minimum and maximum value for +numeric columns, you can provide `numeric_only=True`.
```{code-cell} ipython3 -:tags: [remove-cell] - -# A useful `dplyr` function for calculating summary statistics is `summarize`, -# where the first argument is the data frame and subsequent arguments -# are the summaries we want to perform. -# Here we show how to use the `summarize` function to calculate the minimum -# and maximum number of Canadians -# reporting a particular language as their primary language at home. -# First a reminder of what `region_lang` looks like: +region_lang.max() ``` - ```{code-cell} ipython3 -region_lang = pd.read_csv("data/region_lang.csv") -region_lang +region_lang.min() ``` -We apply `min` to calculate the minimum -and `max` to calculate maximum number of Canadians -reporting a particular language as their primary language at home, -for any region, and `.assign` a column name to each: - -```{code-cell} ipython3 -:tags: [remove-cell] +Similarly, if there are only some columns for which you would like to get summary statistics, +you can first use `loc[]` and then ask for the summary statistic. An example of this is illustrated in {numref}`fig:summarize-across`. +Later, we will talk about how you can also use a more general function, `apply`, to accomplish this. -pd.DataFrame(region_lang["most_at_home"].agg(["min", "max"])).T +```{figure} img/summarize/summarize.003.jpeg +:name: fig:summarize-across +:figclass: figure -# pd.DataFrame(region_lang["most_at_home"].agg(["min", "max"])).T.rename( -# columns={"min": "min_most_at_home", "max": "max_most_at_home"} -# ) +`loc[]` or `apply` is useful for efficiently calculating summary statistics on +many columns at once. The darker, top row of each table represents the column +headers. ``` +Let's say that we want to know +the mean and standard deviation of all of the columns between `"mother_tongue"` and `"lang_known"`. +We use `loc[]` to specify the columns and then `agg` to ask for both the `mean` and `std`.
```{code-cell} ipython3 -:tags: [] - -lang_summary = pd.DataFrame() -lang_summary = lang_summary.assign(min_most_at_home=[min(region_lang["most_at_home"])]) -lang_summary = lang_summary.assign(max_most_at_home=[max(region_lang["most_at_home"])]) -lang_summary +region_lang.loc[:, "mother_tongue":"lang_known"].agg(["mean", "std"]) ``` -```{code-cell} ipython3 -:tags: [remove-cell] -glue("lang_most_people", "{0:,.0f}".format(int(lang_summary["max_most_at_home"]))) -``` -From this we see that there are some languages in the data set that no one speaks -as their primary language at home. We also see that the most commonly spoken -primary language at home is spoken by -{glue:text}`lang_most_people` -people. +## Performing operations on groups of rows using `groupby` +++ -### Calculating summary statistics when there are `NaN`s - -```{index} missing data +```{index} pandas.DataFrame; groupby ``` +What happens if we want to know how languages vary by region? In this case, +we need a new tool that lets us group rows by region. This can be achieved +using the `groupby` function in `pandas`. Pairing summary functions +with `groupby` lets you summarize values for subgroups within a data set, +as illustrated in {numref}`fig:summarize-groupby`. +For example, we can use `groupby` to group the regions of the `region_lang` data +frame and then calculate the minimum and maximum number of Canadians +reporting the language as the primary language at home +for each of the regions in the data set. + ++++ {"tags": []} + +```{figure} img/summarize/summarize.002.jpeg +:name: fig:summarize-groupby +:figclass: figure -```{index} see: NaN; missing data +A summary statistic function paired with `groupby` is useful for calculating that statistic +on one or more column(s) for each group. It +creates a new data frame with one row for each group +and one column for each summary statistic. The darker, top row of each table +represents the column headers.
The gray, blue, and green colored rows +correspond to the rows that belong to each of the three groups being +represented in this cartoon example. ``` -In `pandas` DataFrame, the value `NaN` is often used to denote missing data. -Many of the base python statistical summary functions -(e.g., `max`, `min`, `sum`, etc) will return `NaN` -when applied to columns containing `NaN` values. -Usually that is not what we want to happen; -instead, we would usually like Python to ignore the missing entries -and calculate the summary statistic using all of the other non-`NaN` values -in the column. -Fortunately `pandas` provides many equivalent methods (e.g., `.max`, `.min`, `.sum`, etc) to -these summary functions while providing an extra argument `skipna` that lets -us tell the function what to do when it encounters `NaN` values. -In particular, if we specify `skipna=True` (default), the function will ignore -missing values and return a summary of all the non-missing entries. -We show an example of this below. ++++ -First we create a new version of the `region_lang` data frame, -named `region_lang_na`, that has a seemingly innocuous `NaN` -in the first row of the `most_at_home` column: +The `groupby` function takes at least one argument—the columns to use in the +grouping. Here we use only one column for grouping (`region`). ```{code-cell} ipython3 -:tags: [remove-cell] - -# In data frames in R, the value `NA` is often used to denote missing data. -# Many of the base R statistical summary functions -# (e.g., `max`, `min`, `mean`, `sum`, etc) will return `NA` -# when applied to columns containing `NA` values. \index{missing data}\index{NA|see{missing data}} -# Usually that is not what we want to happen; -# instead, we would usually like R to ignore the missing entries -# and calculate the summary statistic using all of the other non-`NA` values -# in the column. 
-# Fortunately many of these functions provide an argument `na.rm` that lets -# us tell the function what to do when it encounters `NA` values. -# In particular, if we specify `na.rm = TRUE`, the function will ignore -# missing values and return a summary of all the non-missing entries. -# We show an example of this combined with `summarize` below. +region_lang.groupby("region")["most_at_home"].agg(["min", "max"]) ``` +Notice that `groupby` converts a `DataFrame` object to a `DataFrameGroupBy` +object, which contains information about the groups of the data frame. We can +then apply aggregating functions to the `DataFrameGroupBy` object. This can be handy if you would like to perform multiple operations and assign +each output to its own object. ```{code-cell} ipython3 -:tags: [remove-cell] - -region_lang_na = region_lang.copy() -region_lang_na.loc[0, "most_at_home"] = np.nan +region_lang.groupby("region") ``` +You can also pass multiple column names to `groupby`. For example, if we wanted to +know about how the different categories of languages (Aboriginal, Non-Official & +Non-Aboriginal, and Official) are spoken at home in different regions, we would pass a +list including `region` and `category` to `groupby`. ```{code-cell} ipython3 -region_lang_na +region_lang.groupby(["region", "category"])["most_at_home"].agg(["min", "max"]) ``` -Now if we apply the Python built-in summary function as above, -we see that we no longer get the minimum and maximum returned, -but just an `NaN` instead! 
- +You can also ask for grouped summary statistics on the whole data frame ```{code-cell} ipython3 -lang_summary_na = pd.DataFrame() -lang_summary_na = lang_summary_na.assign( - min_most_at_home=[min(region_lang_na["most_at_home"])] -) -lang_summary_na = lang_summary_na.assign( - max_most_at_home=[max(region_lang_na["most_at_home"])] -) -lang_summary_na +:tags: ["output_scroll"] +region_lang.groupby("region").agg(["min", "max"]) ``` -We can fix this by using the `pandas` Series methods (*i.e.* `.min` and `.max`) with `skipna=True` as explained above: - +If you want to ask for only some columns, for example +the columns between `"most_at_home"` and `"lang_known"`, +you might think about first applying `groupby` and then `loc`; +but `groupby` returns a `DataFrameGroupBy` object, which does not +work with `loc`. The other option is to do things the other way around: +first use `loc`, then use `groupby`. +This usually does work, but you have to be careful! For example, +in our case, if we try using `loc` and then `groupby`, we get an error. ```{code-cell} ipython3 -lang_summary_na = pd.DataFrame() -lang_summary_na = lang_summary_na.assign( - min_most_at_home=[region_lang_na["most_at_home"].min(skipna=True)] -) -lang_summary_na = lang_summary_na.assign( - max_most_at_home=[region_lang_na["most_at_home"].max(skipna=True)] -) -lang_summary_na +:tags: [remove-output] +region_lang.loc[:, "most_at_home":"lang_known"].groupby("region").max() +``` +``` +KeyError: 'region' +``` +This is because when we use `loc` we selected only the columns between +`"most_at_home"` and `"lang_known"`, which doesn't include `"region"`! +Instead, we need to call `loc` with a list of column names that +includes `region`, and then use `groupby`. 
+```{code-cell} ipython3 +:tags: ["output_scroll"] +region_lang.loc[ + :, + ["region", "mother_tongue", "most_at_home", "most_at_work", "lang_known"] +].groupby("region").max() ``` - -### Calculating summary statistics for groups of rows +++ -```{index} pandas.DataFrame; groupby -``` - -A common pairing with summary functions is `.groupby`. Pairing these functions -together can let you summarize values for subgroups within a data set, -as illustrated in {numref}`fig:summarize-groupby`. -For example, we can use `.groupby` to group the regions of the `tidy_lang` data frame and then calculate the minimum and maximum number of Canadians -reporting the language as the primary language at home -for each of the regions in the data set. +## Apply functions across multiple columns with `apply` -```{code-cell} ipython3 -:tags: [remove-cell] +### Apply a function to each column with `apply` -# A common pairing with `summarize` is `group_by`. Pairing these functions \index{group\_by} -# together can let you summarize values for subgroups within a data set, -# as illustrated in Figure \@ref(fig:summarize-groupby). -# For example, we can use `group_by` to group the regions of the `tidy_lang` data frame and then calculate the minimum and maximum number of Canadians -# reporting the language as the primary language at home -# for each of the regions in the data set. +An alternative to aggregating on a data frame +for applying a function to many columns is the `apply` method. +Let's again find the maximum value of each column of the +`region_lang` data frame, but using `apply` with the `max` function this time. +We focus on the two arguments of `apply`: +the function that you would like to apply to each column, and the `axis` along +which the function will be applied (`0` for columns, `1` for rows). +Note that `apply` does not have an argument +to specify *which* columns to apply the function to. 
+Therefore, we will use the `loc[]` before calling `apply` +to choose the columns for which we want the maximum. -# (ref:summarize-groupby) `summarize` and `group_by` is useful for calculating summary statistics on one or more column(s) for each group. It creates a new data frame—with one row for each group—containing the summary statistic(s) for each column being summarized. It also creates a column listing the value of the grouping variable. The darker, top row of each table represents the column headers. The gray, blue, and green colored rows correspond to the rows that belong to each of the three groups being represented in this cartoon example. +```{code-cell} ipython3 +region_lang.loc[:, "most_at_home":"most_at_work"].apply(max) ``` +We can use `apply` for much more than summary statistics. +Sometimes we need to apply a function to many columns in a data frame. +For example, we would need to do this when converting units of measurements across many columns. +We illustrate such a data transformation in {numref}`fig:mutate-across`. + +++ {"tags": []} -```{figure} img/summarize/summarize.002.jpeg -:name: fig:summarize-groupby -:figclass: caption-hack +```{figure} img/summarize/summarize.005.jpeg +:name: fig:mutate-across +:figclass: figure -Calculating summary statistics on one or more column(s) for each group. It creates a new data frame—with one row for each group—containing the summary statistic(s) for each column being summarized. It also creates a column listing the value of the grouping variable. The darker, top row of each table represents the column headers. The gray, blue, and green colored rows correspond to the rows that belong to each of the three groups being represented in this cartoon example. +`apply` is useful for applying functions across many columns. The darker, top row of each table represents the column headers. ``` +++ -The `.groupby` function takes at least one argument—the columns to use in the -grouping. 
Here we use only one column for grouping (`region`), but more than one -can also be used. To do this, pass a list of column names to the `by` argument. +For example, +imagine that we wanted to convert all the numeric columns +in the `region_lang` data frame from `int64` type to `int32` type +using the `astype` method. +When we revisit the `region_lang` data frame, +we can see that this would be the columns from `mother_tongue` to `lang_known`. ```{code-cell} ipython3 -region_summary = pd.DataFrame() -region_summary = region_summary.assign( - min_most_at_home=region_lang.groupby(by="region")["most_at_home"].min(), - max_most_at_home=region_lang.groupby(by="region")["most_at_home"].max() -).reset_index() - -region_summary.columns = ["region", "min_most_at_home", "max_most_at_home"] -region_summary +:tags: ["output_scroll"] +region_lang ``` -`pandas` also has a convenient method `.agg` (shorthand for `.aggregate`) that allows us to apply multiple aggregate functions in one line of code. We just need to pass in a list of function names to `.agg` as shown below. +```{index} pandas.DataFrame; apply, pandas.DataFrame; loc[] +``` +To accomplish such a task, we can use `apply`. +As we did above, +we again use `loc[]` to specify the columns +as well as the `apply` to specify the function we want to apply on these columns. +Now, we need a way to tell `apply` what function to perform to each column +so that we can convert them from `int64` to `int32`. We will use what is called +a `lambda` function in Python; `lambda` functions are just regular functions, +except that you don't need to give them a name. +That means you can pass them as an argument into `apply` easily! +Let's consider a simple example of a `lambda` function that +multiplies a number by two.
```{code-cell} ipython3 -region_summary = ( - region_lang.groupby(by="region")["most_at_home"].agg(["min", "max"]).reset_index() -) -region_summary.columns = ["region", "min_most_at_home", "max_most_at_home"] -region_summary +lambda x: 2*x ``` - -Notice that `.groupby` converts a `DataFrame` object to a `DataFrameGroupBy` object, which contains information about the groups of the dataframe. We can then apply aggregating functions to the `DataFrameGroupBy` object. - +We define a `lambda` function in the following way. We start with the syntax `lambda`, which is a special word +that tells Python "what follows is +a function." Following this, we then state the name of the arguments of the function. +In this case, we just have one argument named `x`. After the list of arguments, we put a +colon `:`. And finally after the colon are the instructions: take the value provided and multiply it by 2. +Let's call our shiny new `lambda` function with the argument `2` (so the output should be `4`). +Just like a regular function, we pass its argument between parentheses `()` symbols. ```{code-cell} ipython3 -:tags: [remove-cell] - -# Notice that `group_by` on its own doesn't change the way the data looks. -# In the output below, the grouped data set looks the same, -# and it doesn't *appear* to be grouped by `region`. -# Instead, `group_by` simply changes how other functions work with the data, -# as we saw with `summarize` above. +(lambda x: 2*x)(2) ``` +> **Note:** Because we didn't give the `lambda` function a name, we have to surround it with +> parentheses too if we want to call it. Otherwise, if we wrote something like `lambda x: 2*x(2)`, Python would get confused +> and think that `(2)` was part of the instructions that comprise the `lambda` function. +> As long as we don't want to call the `lambda` function ourselves, we don't need those parentheses. For example, +> we can pass a `lambda` function as an argument to `apply` without any parentheses. 
+Returning to our example, let's use `apply` to convert the columns `"mother_tongue":"lang_known"` +to `int32`. To accomplish this we create a `lambda` function that takes one argument---a single column +of the data frame, which we will name `col`---and apply the `astype` method to it. +Then the `apply` method will use that `lambda` function on every column we specify via `loc[]`. ```{code-cell} ipython3 -region_lang.groupby("region") +region_lang_nums = region_lang.loc[:, "mother_tongue":"lang_known"].apply(lambda col: col.astype("int32")) +region_lang_nums.info() ``` +You can now see that the columns from `mother_tongue` to `lang_known` are type `int32`. +You can also see that `apply` returns a data frame with the same number of columns and rows +as the input data frame. The only thing `apply` does is use the `lambda` function argument +on each of the specified columns. -### Calculating summary statistics on many columns +### Apply a function row-wise with `apply` -+++ +What if you want to apply a function across columns but within one row? +We illustrate such a data transformation in {numref}`fig:rowwise`. -Sometimes we need to summarize statistics across many columns. -An example of this is illustrated in {numref}`fig:summarize-across`. -In such a case, using summary functions alone means that we have to -type out the name of each column we want to summarize. -In this section we will meet two strategies for performing this task. -First we will see how we can do this using `.iloc[]` to slice the columns before applying summary functions. -Then we will also explore how we can use a more general iteration function, -`.apply`, to also accomplish this. ++++ {"tags": []} -```{code-cell} ipython3 -:tags: [remove-cell] +```{figure} img/summarize/summarize.004.jpeg +:name: fig:rowwise +:figclass: figure -# Sometimes we need to summarize statistics across many columns. -# An example of this is illustrated in Figure \@ref(fig:summarize-across). 
-# In such a case, using `summarize` alone means that we have to -# type out the name of each column we want to summarize. -# In this section we will meet two strategies for performing this task. -# First we will see how we can do this using `summarize` + `across`. -# Then we will also explore how we can use a more general iteration function, -# `map`, to also accomplish this. +`apply` is useful for applying functions across columns within one row. The +darker, top row of each table represents the column headers. ``` -+++ {"tags": []} ++++ -```{figure} img/summarize/summarize.003.jpeg -:name: fig:summarize-across -:figclass: caption-hack +For instance, suppose we want to know the maximum value between `mother_tongue`, +`most_at_home`, `most_at_work`, and `lang_known` for each language and region +in the `region_lang_nums` data set. +In other words, we want to apply the `max` function *row-wise.* +In order to tell `apply` that we want to work row-wise (as opposed to acting on each column +individually, which is the default behavior), we just specify the argument `axis=1`. +For example, in the case of the `max` function, this tells Python that we would like +the `max` within each row of the input, as opposed to being applied on each column. -`.iloc[]` or `.apply` is useful for efficiently calculating summary statistics on many columns at once. The darker, top row of each table represents the column headers. +```{code-cell} ipython3 +region_lang_nums.apply(max, axis=1) ``` -+++ +We see that we get a column, which is the maximum value between `mother_tongue`, +`most_at_home`, `most_at_work` and `lang_known` for each language +and region. It is often the case that we want to include a column result +from using `apply` row-wise as a new column in the data frame, so that we can make +plots or continue our analysis. To make this happen, +we will use `assign` to create a new column. This is discussed in the next section.
-#### Aggregating on a data frame for calculating summary statistics on many columns +(pandas-assign)= +## Using `assign` to modify or add columns -+++ -```{index} column range +```{index} pandas.DataFrame; [] ``` -Recall that in the Section {ref}`loc-iloc`, we can use `.iloc[]` to extract a range of columns with indices. Here we demonstrate finding the maximum value -of each of the numeric -columns of the `region_lang` data set through pairing `.iloc[]` and `.max`. This means that the -summary methods (*e.g.* `.min`, `.max`, `.sum` etc.) can be used for data frames as well. +### Using `assign` to create new columns -```{code-cell} ipython3 -pd.DataFrame(region_lang.iloc[:, 3:].max(axis=0)).T -``` +When we compute summary statistics with `agg` or apply functions using `apply` +those give us new data frames. But what if we want to append that information +to an existing data frame? This is where we make use of the `assign` method. +For example, say we wanted the maximum values of the `region_lang_nums` data frame, +and to create a new data frame consisting of all the columns of `region_lang` as well as that additional column. +To do this, we will (1) compute the maximum of those columns using `apply`, +and (2) use `assign` to assign values to create a new column in the `region_lang` data frame. +Note that `assign` does not by default modify the data frame itself; it creates a copy +with the new column added to it. +To use the `assign` method, we specify one argument for each column we want to create. +In this case we want to create one new column named `maximum`, so the argument +to `assign` begins with `maximum = `. +Then after the `=`, we specify what the contents of that new column +should be. In this case we use `apply` just as we did in the previous section to give us the maximum values. +Remember to specify `axis=1` in the `apply` method so that we compute the row-wise maximum value. 
```{code-cell} ipython3 ---- -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# To summarize statistics across many columns, we can use the -# `summarize` function we have just recently learned about. -# However, in such a case, using `summarize` alone means that we have to -# type out the name of each column we want to summarize. -# To do this more efficiently, we can pair `summarize` with `across` \index{across} -# and use a colon `:` to specify a range of columns we would like \index{column range} -# to perform the statistical summaries on. -# Here we demonstrate finding the maximum value -# of each of the numeric -# columns of the `region_lang` data set. - -# ``` {r 02-across-data} -# region_lang |> -# summarize(across(mother_tongue:lang_known, max)) -# ``` - -# > **Note:** Similar to when we use base R statistical summary functions -# > (e.g., `max`, `min`, `mean`, `sum`, etc) with `summarize` alone, -# > the use of the `summarize` + `across` functions paired -# > with base R statistical summary functions -# > also return `NA`s when we apply them to columns that -# > contain `NA`s in the data frame. \index{missing data} -# > -# > To avoid this, again we need to add the argument `na.rm = TRUE`, -# > but in this case we need to use it a little bit differently. -# > In this case, we need to add a `,` and then `na.rm = TRUE`, -# > after specifying the function we want `summarize` + `across` to apply, -# > as illustrated below: -# > -# > ``` {r} -# > region_lang_na |> -# > summarize(across(mother_tongue:lang_known, max, na.rm = TRUE)) -# > ``` -``` - -(apply-summary)= -#### `.apply` for calculating summary statistics on many columns - -+++ - -```{index} pandas.DataFrame; apply +:tags: ["output_scroll"] +region_lang.assign( + maximum = region_lang_nums.apply(max, axis=1) +) ``` +This gives us a new data frame that looks like the `region_lang` data frame, +except that it has an additional column named `maximum`. 
+The `maximum` column contains +the maximum value between `mother_tongue`, +`most_at_home`, `most_at_work` and `lang_known` for each language +and region, just as we specified! -An alternative to aggregating on a dataframe -for applying a function to many columns is the `.apply` method. -Let's again find the maximum value of each column of the -`region_lang` data frame, but using `.apply` with the `max` function this time. -We focus on the two arguments of `.apply`: -the function that you would like to apply to each column, and the `axis` along which the function will be applied (`0` for columns, `1` for rows). -Note that `.apply` does not have an argument -to specify *which* columns to apply the function to. -Therefore, we will use the `.iloc[]` before calling `.apply` -to choose the columns for which we want the maximum. ```{code-cell} ipython3 ---- -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# An alternative to `summarize` and `across` -# for applying a function to many columns is the `map` family of functions. \index{map} -# Let's again find the maximum value of each column of the -# `region_lang` data frame, but using `map` with the `max` function this time. -# `map` takes two arguments: -# an object (a vector, data frame or list) that you want to apply the function to, -# and the function that you would like to apply to each column. -# Note that `map` does not have an argument -# to specify *which* columns to apply the function to. -# Therefore, we will use the `select` function before calling `map` -# to choose the columns for which we want the maximum. -``` - -```{code-cell} ipython3 -pd.DataFrame(region_lang.iloc[:, 3:].apply(max, axis=0)).T -``` - -```{index} missing data -``` - -> **Note:** Similar to when we use base Python statistical summary functions -> (e.g., `max`, `min`, `sum`, etc.) 
when there are `NaN`s, -> `.apply` functions paired with base Python statistical summary functions -> also return `NaN` values when we apply them to columns that -> contain `NaN` values. -> -> To avoid this, again we need to use the `pandas` variants of summary functions (*i.e.* -> `.max`, `.min`, `.sum`, etc.) with `skipna=True`. -> When we use this with `.apply`, we do this by constructing a anonymous function that calls -> the `.max` method with `skipna=True`, as illustrated below: - -```{code-cell} ipython3 -pd.DataFrame( - region_lang_na.iloc[:, 3:].apply(lambda col: col.max(skipna=True), axis=0) -).T -``` - -The `.apply` function is generally quite useful for solving many problems -involving repeatedly applying functions in Python. -Additionally, a variant of `.apply` is `.applymap`, -which can be used to apply functions element-wise. -To learn more about these functions, see the additional resources -section at the end of this chapter. - -+++ {"jp-MarkdownHeadingCollapsed": true, "tags": ["remove-cell"]} - - - -+++ {"tags": []} - -## Apply functions across many columns with `.apply` - -Sometimes we need to apply a function to many columns in a data frame. -For example, we would need to do this when converting units of measurements across many columns. -We illustrate such a data transformation in {numref}`fig:mutate-across`. +:tags: [remove-cell] -+++ {"tags": []} +number_most_home = int( + official_langs[ + (official_langs["language"] == "English") & + (official_langs["region"] == "Toronto") + ]["most_at_home"] +) -```{figure} img/summarize/summarize.005.jpeg -:name: fig:mutate-across -:figclass: caption-hack +toronto_popn = int(region_data[region_data["region"] == "Toronto"]["population"]) -`.apply` is useful for applying functions across many columns. The darker, top row of each table represents the column headers. 
+glue("number_most_home", "{0:,.0f}".format(number_most_home)) +glue("toronto_popn", "{0:,.0f}".format(toronto_popn)) +glue("prop_eng_tor", "{0:.2f}".format(number_most_home / toronto_popn)) ``` -+++ - -For example, -imagine that we wanted to convert all the numeric columns -in the `region_lang` data frame from `int64` type to `int32` type -using the `.as_type` function. -When we revisit the `region_lang` data frame, -we can see that this would be the columns from `mother_tongue` to `lang_known`. +As another example, we might ask the question: "What proportion of +the population reported English as their primary language at home in the 2016 census?" +For example, in Toronto, {glue:text}`number_most_home` people reported +speaking English as their primary language at home, and the +population of Toronto was reported to be +{glue:text}`toronto_popn` people. So the proportion of people reporting English +as their primary language in Toronto in the 2016 census was {glue:text}`prop_eng_tor`. +How could we figure this out starting from the `region_lang` data frame? -```{code-cell} ipython3 ---- -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# For example, -# imagine that we wanted to convert all the numeric columns -# in the `region_lang` data frame from double type to integer type -# using the `as.integer` function. -# When we revisit the `region_lang` data frame, -# we can see that this would be the columns from `mother_tongue` to `lang_known`. +First, we need to filter the `region_lang` data frame +so that we only keep the rows where the language is English. +We will also restrict our attention to the five major cities +in the `five_cities` data frame: Toronto, Montréal, Vancouver, Calgary, and Edmonton. +We will filter to keep only those rows pertaining to the English language +and pertaining to the five aforementioned cities. To combine these two logical statements +we will use the `&` symbol.
+Then, using the `[]` operation, +we filter for the rows with `"English"` as the `language` and a `region` in `five_cities`, +and name the new data frame `english_lang`. +```{code-cell} ipython3 +:tags: ["output_scroll"] +english_lang = region_lang[ + (region_lang["language"] == "English") & + (region_lang["region"].isin(five_cities["region"])) +] +english_lang ``` +Okay, now we have a data frame that pertains only to the English language +and the five cities mentioned earlier. +In order to compute the proportion of the population speaking English in each of these cities, +we need to add the population data from the `five_cities` data frame. ```{code-cell} ipython3 +five_cities ``` - -```{index} pandas.DataFrame; apply, pandas.DataFrame; iloc[] +The data frame above shows that the populations of the five cities in 2016 were +5928040 (Toronto), 4098927 (Montréal), 2463431 (Vancouver), 1392609 (Calgary), and 1321426 (Edmonton). +We will add this information to our data frame in a new column named `city_pops` by using `assign`. +Once again we specify the new column name (`city_pops`) as the argument, followed by the equal symbol `=`, +and finally the data in the column. +Note that the order of the rows in the `english_lang` data frame is Montréal, Toronto, Calgary, Edmonton, Vancouver. +So we will create a column called `city_pops` where we list the populations of those cities in that +order, and add it to our data frame. +Also note that we write `english_lang = ` on the left so that the newly created data frame overwrites our +old `english_lang` data frame; remember that by default, like other `pandas` functions, `assign` does not +modify the original data frame directly! +```{code-cell} ipython3 +:tags: ["output_scroll"] +english_lang = english_lang.assign( + city_pops=[4098927, 5928040, 1392609, 1321426, 2463431] +) +english_lang +``` +> **Note**: Inserting data manually in this way is generally very error-prone and is not recommended.
+> We do it here to demonstrate another usage of `assign` that does not involve `apply`. +> But in more advanced data wrangling, +> one would solve this problem in a less error-prone way using +> the `merge` function, which lets you combine two data frames. We will show you an +> example using `merge` at the end of the chapter! + +Now we have a new column with the population for each city. Finally, we calculate the +proportion of people who speak English the most at home by taking the ratio of the columns +`most_at_home` and `city_pops`. We will again add this to our data frame using `assign`. +```{code-cell} ipython3 +:tags: ["output_scroll"] +english_lang.assign( + proportion=english_lang["most_at_home"]/english_lang["city_pops"] +) ``` -To accomplish such a task, we can use `.apply`. -This works in a similar way for column selection, -as we saw when we used in Section {ref}`apply-summary` earlier. -As we did above, -we again use `.iloc` to specify the columns -as well as the `.apply` to specify the function we want to apply on these columns. -However, a key difference here is that we are not using aggregating function here, -which means that we get back a data frame with the same number of rows. - -```{code-cell} ipython3 ---- -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# To accomplish such a task, we can use `mutate` paired with `across`. \index{across} -# This works in a similar way for column selection, -# as we saw when we used `summarize` + `across` earlier. -# As we did above, -# we again use `across` to specify the columns using `select` syntax -# as well as the function we want to apply on the specified columns. -# However, a key difference here is that we are using `mutate`, -# which means that we get back a data frame with the same number of rows. 
-``` -```{code-cell} ipython3 -region_lang.dtypes -``` ++++ -```{code-cell} ipython3 -region_lang_int32 = region_lang.iloc[:, 3:].apply(lambda col: col.astype('int32'), axis=0) -region_lang_int32 = pd.concat((region_lang.iloc[:, :3], region_lang_int32), axis=1) -region_lang_int32 -``` -```{code-cell} ipython3 -region_lang_int32.dtypes -``` +### Using `assign` to modify columns -We see that we get back a data frame -with the same number of columns and rows. -The only thing that changes is the transformation we applied -to the specified columns (here `mother_tongue` to `lang_known`). -+++ +In the section on {ref}`str-split`, +when we first read in the `"region_lang_top5_cities_messy.csv"` data, +all of the variables were "object" data types. +During the tidying process, +we used the `pandas.to_numeric` function +to convert the `most_at_home` and `most_at_work` columns +to the desired integer (i.e., numeric class) data types and then used `[]` to overwrite columns. +We can do the same thing using `assign`. + +Below we use `assign` to convert the columns `most_at_home` and `most_at_work` +to numeric data types in the `official_langs` data set as described in +{numref}`fig:img-assign`. In our example, we are naming the columns the same +names as columns that already exist in the data frame +(`"most_at_home"`, `"most_at_work"`) +and this will cause `assign` to *overwrite* those columns +(also referred to as modifying those columns *in-place*). +If we were to give the columns a new name, +then `assign` would create new columns with the names we specified. +The syntax is detailed in {numref}`fig:img-assign`. -## Apply functions across columns within one row with `.apply` +```{code-cell} ipython3 +:tags: ["output_scroll"] +official_langs_numeric = official_langs.assign( + most_at_home=pd.to_numeric(official_langs["most_at_home"]), + most_at_work=pd.to_numeric(official_langs["most_at_work"]), +) -What if you want to apply a function across columns but within one row? 
-We illustrate such a data transformation in {numref}`fig:rowwise`. +official_langs_numeric +``` +++ {"tags": []} -```{figure} img/summarize/summarize.004.jpeg -:name: fig:rowwise -:figclass: caption-hack +```{figure} img/wrangling/pandas_assign_args_labels.png +:name: fig:img-assign +:figclass: figure -`.apply` is useful for applying functions across columns within one row. The darker, top row of each table represents the column headers. +Syntax for the `assign` function. ``` +++ -For instance, suppose we want to know the maximum value between `mother_tongue`, -`most_at_home`, `most_at_work` -and `lang_known` for each language and region -in the `region_lang` data set. -In other words, we want to apply the `max` function *row-wise.* -Before we use `.apply`, we will again use `.iloc` to select only the count columns -so we can see all the columns in the data frame's output easily in the book. -So for this demonstration, the data set we are operating on looks like this: ```{code-cell} ipython3 ---- -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# For instance, suppose we want to know the maximum value between `mother_tongue`, -# `most_at_home`, `most_at_work` -# and `lang_known` for each language and region -# in the `region_lang` data set. -# In other words, we want to apply the `max` function *row-wise.* -# We will use the (aptly named) `rowwise` function in combination with `mutate` -# to accomplish this task. - -# Before we apply `rowwise`, we will `select` only the count columns \index{rowwise} -# so we can see all the columns in the data frame's output easily in the book. -# So for this demonstration, the data set we are operating on looks like this: +official_langs_numeric.info() ``` +Now we see that the `most_at_home` and `most_at_work` columns are both `int64` (which is a numeric data type)! +Note that we were careful here and created a new data frame object `official_langs_numeric`. 
Since `assign` has +the power to overwrite the entries of a column, it is a good idea to create a new data frame object so that if +you make a mistake, you can start again from the original data frame. + ++++ + + +### Using `assign` to create a new data frame + ```{code-cell} ipython3 -region_lang.iloc[:, 3:] -``` +:tags: [remove-cell] -Now we use `.apply` with argument `axis=1`, to tell Python that we would like -the `max` function to be applied across, and within, a row, -as opposed to being applied on a column -(which is the default behavior of `.apply`): +english_lang = region_lang[region_lang["language"] == "English"] +five_cities = ["Toronto", "Montréal", "Vancouver", "Calgary", "Edmonton"] +english_lang = english_lang[english_lang["region"].isin(five_cities)] +english_lang +``` +Sometimes you want to create a new data frame. You can use `assign` to create a data frame from scratch. +Let's return to the example of wanting to compute the proportions of people who speak English +most at home in Toronto, Montréal, Vancouver, Calgary, and Edmonton. Before adding new columns, we filtered +our `region_lang` to create the `english_lang` data frame containing only English speakers in the five cities +of interest. ```{code-cell} ipython3 ---- -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# Now we apply `rowwise` before `mutate`, to tell R that we would like -# the mutate function to be applied across, and within, a row, -# as opposed to being applied on a column -# (which is the default behavior of `mutate`): +:tags: ["output_scroll"] +english_lang ``` +We then wanted to add the populations of these cities as a column using `assign` +(Toronto: 5928040, Montréal: 4098927, Vancouver: 2463431, +Calgary: 1392609, and Edmonton: 1321426). We had to be careful to add those populations in the +right order, and it could be easy to make a mistake this way.
An alternative approach, which we demonstrate here, +is to (1) create a new, empty data frame, (2) use `assign` to assign the city names and populations in that +data frame, and (3) use `merge` to combine the two data frames, recognizing that the "regions" are the same. +We create a new, empty data frame by calling `pd.DataFrame` with no arguments. +We then use `assign` to add the city names in a column called `"region"` +and their populations in a column called `"population"`. ```{code-cell} ipython3 -region_lang_rowwise = region_lang.assign( - maximum=region_lang.iloc[:, 3:].apply(max, axis=1) +city_populations = pd.DataFrame().assign( + region=["Toronto", "Montréal", "Vancouver", "Calgary", "Edmonton"], + population=[5928040, 4098927, 2463431, 1392609, 1321426] ) - -region_lang_rowwise +city_populations ``` - -We see that we get an additional column added to the data frame, -named `maximum`, which is the maximum value between `mother_tongue`, -`most_at_home`, `most_at_work` and `lang_known` for each language -and region. - +This new data frame has the same `region` column as the `english_lang` data frame. The order of +the cities is different, but that is okay! We can use the `merge` function in `pandas` to say +we would like to combine the two data frames by matching the `region` between them. The argument +`on="region"` tells pandas we would like to use the `region` column to match up the entries. ```{code-cell} ipython3 ---- -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# Similar to `group_by`, -# `rowwise` doesn't appear to do anything when it is called by itself. -# However, we can apply `rowwise` in combination -# with other functions to change how these other functions operate on the data. -# Notice if we used `mutate` without `rowwise`, -# we would have computed the maximum value across *all* rows -# rather than the maximum value for *each* row. -# Below we show what would have happened had we not used -# `rowwise`.
In particular, the same maximum value is reported -# in every single row; this code does not provide the desired result. - -# ```{r} -# region_lang |> -# select(mother_tongue:lang_known) |> -# mutate(maximum = max(c(mother_tongue, -# most_at_home, -# most_at_home, -# lang_known))) -# ``` +:tags: ["output_scroll"] +english_lang = english_lang.merge(city_populations, on="region") +english_lang ``` +You can see that the populations for each city are correct (e.g. Montréal: 4098927, Toronto: 5928040), +and we could proceed with our analysis from here. ## Summary -Cleaning and wrangling data can be a very time-consuming process. However, +Cleaning and wrangling data can be a very time-consuming process. However, it is a critical step in any data analysis. We have explored many different -functions for cleaning and wrangling data into a tidy format. -{numref}`tab:summary-functions-table` summarizes some of the key wrangling -functions we learned in this chapter. In the following chapters, you will -learn how you can take this tidy data and do so much more with it to answer your +functions for cleaning and wrangling data into a tidy format. +{numref}`tab:summary-functions-table` summarizes some of the key wrangling +functions we learned in this chapter. In the following chapters, you will +learn how you can take this tidy data and do so much more with it to answer your burning data science questions!
+++ -```{table} Summary of wrangling functions +```{table} Summary of wrangling functions :name: tab:summary-functions-table | Function | Description | -| --- | ----------- | -| `.agg` | calculates aggregated summaries of inputs | -| `.apply` | allows you to apply function(s) to multiple columns/rows | -| `.assign` | adds or modifies columns in a data frame | -| `.groupby` | allows you to apply function(s) to groups of rows | -| `.iloc` | subsets columns/rows of a data frame using integer indices | -| `.loc` | subsets columns/rows of a data frame using labels | -| `.melt` | generally makes the data frame longer and narrower | -| `.pivot` | generally makes a data frame wider and decreases the number of rows | -| `.str.split` | splits up a string column into multiple columns | -``` - -```{code-cell} ipython3 ---- -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# ## Summary - -# Cleaning and wrangling data can be a very time-consuming process. However, -# it is a critical step in any data analysis. We have explored many different -# functions for cleaning and wrangling data into a tidy format. -# Table \@ref(tab:summary-functions-table) summarizes some of the key wrangling -# functions we learned in this chapter. In the following chapters, you will -# learn how you can take this tidy data and do so much more with it to answer your -# burning data science questions! 
- -# \newpage - -# Table: (#tab:summary-functions-table) Summary of wrangling functions - -# | Function | Description | -# | --- | ----------- | -# | `across` | allows you to apply function(s) to multiple columns | -# | `filter` | subsets rows of a data frame | -# | `group_by` | allows you to apply function(s) to groups of rows | -# | `mutate` | adds or modifies columns in a data frame | -# | `map` | general iteration function | -# | `pivot_longer` | generally makes the data frame longer and narrower | -# | `pivot_wider` | generally makes a data frame wider and decreases the number of rows | -# | `rowwise` | applies functions across columns within one row | -# | `separate` | splits up a character column into multiple columns | -# | `select` | subsets columns of a data frame | -# | `summarize` | calculates summaries of inputs | +| --- | ----------- | +| `agg` | calculates aggregated summaries of inputs | +| `apply` | allows you to apply function(s) to multiple columns/rows | +| `assign` | adds or modifies columns in a data frame | +| `groupby` | allows you to apply function(s) to groups of rows | +| `iloc` | subsets columns/rows of a data frame using integer indices | +| `loc` | subsets columns/rows of a data frame using labels | +| `melt` | generally makes the data frame longer and narrower | +| `merge` | combines two data frames | +| `pivot` | generally makes a data frame wider and decreases the number of rows | +| `str.split` | splits up a string column into multiple columns | ``` ## Exercises -Practice exercises for the material covered in this chapter -can be found in the accompanying -[worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets#readme) +Practice exercises for the material covered in this chapter +can be found in the accompanying +[worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme) in the "Cleaning and wrangling data" row.
You can launch an interactive version of the worksheet in your browser by clicking the "launch binder" button. You can also preview a non-interactive version of the worksheet by clicking "view worksheet." If you instead decide to download the worksheet and run it on your own machine, make sure to follow the instructions for computer setup -found in Chapter {ref}`move-to-your-own-machine`. This will ensure that the automated feedback +found in the chapter on {ref}`move-to-your-own-machine`. This will ensure that the automated feedback and guidance that the worksheets provide will function as intended. +++ {"tags": []} -## Additional resources +## Additional resources - The [`pandas` package documentation](https://pandas.pydata.org/docs/reference/index.html) is another resource to learn more about the functions in this @@ -2417,58 +1805,15 @@ and guidance that the worksheets provide will function as intended. - *Python for Data Analysis* {cite:p}`mckinney2012python` has a few chapters related to data wrangling that go into more depth than this book. For example, the [data wrangling chapter](https://wesmckinney.com/book/data-wrangling.html) covers tidy data, - `.melt` and `.pivot`, but also covers missing values - and additional wrangling functions (like `.stack`). The [data + `melt` and `pivot`, but also covers missing values + and additional wrangling functions (like `stack`). The [data aggregation chapter](https://wesmckinney.com/book/data-aggregation.html) covers - `.groupby`, aggregating functions, `.apply`, etc. + `groupby`, aggregating functions, `apply`, etc. - You will occasionally encounter a case where you need to iterate over items in a data frame, but none of the above functions are flexible enough to do what you want. In that case, you may consider using [a for loop](https://wesmckinney.com/book/python-basics.html#control_for) {cite:p}`mckinney2012python`. 
-```{code-cell} ipython3 ---- -jp-MarkdownHeadingCollapsed: true -jupyter: - source_hidden: true -tags: [remove-cell] ---- -# ## Additional resources - -# - As we mentioned earlier, `tidyverse` is actually an *R -# meta package*: it installs and loads a collection of R packages that all -# follow the tidy data philosophy we discussed above. One of the `tidyverse` -# packages is `dplyr`—a data wrangling workhorse. You have already met many -# of `dplyr`'s functions -# (`select`, `filter`, `mutate`, `arrange`, `summarize`, and `group_by`). -# To learn more about these functions and meet a few more useful -# functions, we recommend you check out Chapters 5-9 of the [STAT545 online notes](https://stat545.com/). -# of the data wrangling, exploration, and analysis with R book. -# - The [`dplyr` R package documentation](https://dplyr.tidyverse.org/) [@dplyr] is -# another resource to learn more about the functions in this -# chapter, the full set of arguments you can use, and other related functions. -# The site also provides a very nice cheat sheet that summarizes many of the -# data wrangling functions from this chapter. -# - Check out the [`tidyselect` R package page](https://tidyselect.r-lib.org/index.html) -# [@tidyselect] for a comprehensive list of `select` helpers. -# These helpers can be used to choose columns in a data frame when paired with the `select` function -# (and other functions that use the `tidyselect` syntax, such as `pivot_longer`). -# The [documentation for `select` helpers](https://tidyselect.r-lib.org/reference/select_helpers.html) -# is a useful reference to find the helper you need for your particular problem. -# - *R for Data Science* [@wickham2016r] has a few chapters related to -# data wrangling that go into more depth than this book. 
For example, the -# [tidy data chapter](https://r4ds.had.co.nz/tidy-data.html) covers tidy data, -# `pivot_longer`/`pivot_wider` and `separate`, but also covers missing values -# and additional wrangling functions (like `unite`). The [data -# transformation chapter](https://r4ds.had.co.nz/transform.html) covers -# `select`, `filter`, `arrange`, `mutate`, and `summarize`. And the [`map` -# functions chapter](https://r4ds.had.co.nz/iteration.html#the-map-functions) -# provides more about the `map` functions. -# - You will occasionally encounter a case where you need to iterate over items -# in a data frame, but none of the above functions are flexible enough to do -# what you want. In that case, you may consider using [a for -# loop](https://r4ds.had.co.nz/iteration.html#iteration). -``` ## References @@ -2476,4 +1821,4 @@ tags: [remove-cell] ```{bibliography} :filter: docname in docnames -``` \ No newline at end of file +``` diff --git a/unused/install/conda-linux-64.lock b/unused/install/conda-linux-64.lock deleted file mode 100644 index ba3c46fd..00000000 --- a/unused/install/conda-linux-64.lock +++ /dev/null @@ -1,270 +0,0 @@ -# Generated by conda-lock. 
-# platform: linux-64 -# input_hash: 89f580fffc52744967507b06c57aba086f847ee041e119e07deb7ff6508a1608 -@EXPLICIT -https://conda.anaconda.org/t//conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 -https://conda.anaconda.org/t//conda-forge/linux-64/ca-certificates-2021.10.8-ha878542_0.tar.bz2#575611b8a84f45960e87722eeb51fa26 -https://conda.anaconda.org/t//conda-forge/linux-64/ld_impl_linux-64-2.36.1-hea4e1c9_2.tar.bz2#bd4f2e711b39af170e7ff15163fe87ee -https://conda.anaconda.org/t//conda-forge/linux-64/libgfortran5-11.2.0-h5c6108e_16.tar.bz2#ff034874d96195a5c5be34200689b5b7 -https://conda.anaconda.org/t//conda-forge/linux-64/libstdcxx-ng-11.2.0-he4da1e4_16.tar.bz2#8cfd1cd3273ff187be91b868ddf9a636 -https://conda.anaconda.org/t//conda-forge/linux-64/pandoc-2.18-ha770c72_0.tar.bz2#518b07342786b362238d22f76789ed59 -https://conda.anaconda.org/t//conda-forge/noarch/pybind11-abi-4-hd8ed1ab_3.tar.bz2#878f923dd6acc8aeb47a75da6c4098be -https://conda.anaconda.org/t//conda-forge/noarch/tzdata-2022a-h191b570_0.tar.bz2#84be5301069417a2221187d2f435e0f7 -https://conda.anaconda.org/t//conda-forge/linux-64/libgfortran-ng-11.2.0-h69a702a_16.tar.bz2#27974aad841c189854df09426b1b9fac -https://conda.anaconda.org/t//conda-forge/linux-64/libgomp-11.2.0-h1d223b6_16.tar.bz2#e935fb0c92c6ffb63c736a2012604d72 -https://conda.anaconda.org/t//conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d -https://conda.anaconda.org/t//conda-forge/linux-64/libgcc-ng-11.2.0-h1d223b6_16.tar.bz2#71feb63a30085cbce51847d5ef1f769d -https://conda.anaconda.org/t//conda-forge/linux-64/bzip2-1.0.8-h7f98852_4.tar.bz2#a1fd65c7ccbf10880423d82bca54eb54 -https://conda.anaconda.org/t//conda-forge/linux-64/c-ares-1.18.1-h7f98852_0.tar.bz2#f26ef8098fab1f719c91eb760d63381a -https://conda.anaconda.org/t//conda-forge/linux-64/expat-2.4.8-h27087fc_0.tar.bz2#e1b07832504eeba765d648389cc387a9 
-https://conda.anaconda.org/t//conda-forge/linux-64/icu-70.1-h27087fc_0.tar.bz2#87473a15119779e021c314249d4b4aed -https://conda.anaconda.org/t//conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 -https://conda.anaconda.org/t//conda-forge/linux-64/libev-4.33-h516909a_1.tar.bz2#6f8720dff19e17ce5d48cfe7f3d2f0a3 -https://conda.anaconda.org/t//conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3 -https://conda.anaconda.org/t//conda-forge/linux-64/libiconv-1.16-h516909a_0.tar.bz2#5c0f338a513a2943c659ae619fca9211 -https://conda.anaconda.org/t//conda-forge/linux-64/libnsl-2.0.0-h7f98852_0.tar.bz2#39b1328babf85c7c3a61636d9cd50206 -https://conda.anaconda.org/t//conda-forge/linux-64/libopenblas-0.3.20-pthreads_h78a6416_0.tar.bz2#9b6d0781953c9e353faee494336cc229 -https://conda.anaconda.org/t//conda-forge/linux-64/libsodium-1.0.18-h36c2ea0_1.tar.bz2#c3788462a6fbddafdb413a9f9053e58d -https://conda.anaconda.org/t//conda-forge/linux-64/libuuid-2.32.1-h7f98852_1000.tar.bz2#772d69f030955d9646d3d0eaf21d859d -https://conda.anaconda.org/t//conda-forge/linux-64/libuv-1.43.0-h7f98852_0.tar.bz2#b34d856aa7e06ebd79bded72ef4afc16 -https://conda.anaconda.org/t//conda-forge/linux-64/libzlib-1.2.11-h166bdaf_1014.tar.bz2#757138ba3ddc6777b82e91d9ff62e7b9 -https://conda.anaconda.org/t//conda-forge/linux-64/lz4-c-1.9.3-h9c3ff4c_1.tar.bz2#fbe97e8fa6f275d7c76a09e795adc3e6 -https://conda.anaconda.org/t//conda-forge/linux-64/lzo-2.10-h516909a_1000.tar.bz2#bb14fcb13341b81d5eb386423b9d2bac -https://conda.anaconda.org/t//conda-forge/linux-64/ncurses-6.3-h27087fc_1.tar.bz2#4acfc691e64342b9dae57cf2adc63238 -https://conda.anaconda.org/t//conda-forge/linux-64/openssl-1.1.1o-h166bdaf_0.tar.bz2#6172048796b123e542945d998f5150b7 -https://conda.anaconda.org/t//conda-forge/linux-64/pcre-8.45-h9c3ff4c_0.tar.bz2#c05d1820a6d34ff07aaaab7a9b7eddaa 
-https://conda.anaconda.org/t//conda-forge/linux-64/reproc-14.2.3-h7f98852_0.tar.bz2#1e16d4142b016b6a5ebdeb3d6d33aaf4 -https://conda.anaconda.org/t//conda-forge/linux-64/xz-5.2.5-h516909a_1.tar.bz2#33f601066901f3e1a85af3522a8113f9 -https://conda.anaconda.org/t//conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2#4cb3ad778ec2d5a7acbdf254eb1c42ae -https://conda.anaconda.org/t//conda-forge/linux-64/yaml-cpp-0.6.3-he1b5a44_4.tar.bz2#8e873da49d14b584bed5a09084a68136 -https://conda.anaconda.org/t//conda-forge/linux-64/gettext-0.19.8.1-h73d1719_1008.tar.bz2#af49250eca8e139378f8ff0ae9e57251 -https://conda.anaconda.org/t//conda-forge/linux-64/libblas-3.9.0-14_linux64_openblas.tar.bz2#fb31fbbde682414550bbe15e3964420f -https://conda.anaconda.org/t//conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1 -https://conda.anaconda.org/t//conda-forge/linux-64/libsolv-0.7.22-h6239696_0.tar.bz2#461963bb499e58bae159a898600f8792 -https://conda.anaconda.org/t//conda-forge/linux-64/perl-5.32.1-2_h7f98852_perl5.tar.bz2#09ba115862623f00962e9809ea248f1a -https://conda.anaconda.org/t//conda-forge/linux-64/readline-8.1-h46c0cb4_0.tar.bz2#5788de3c8d7a7d64ac56c784c4ef48e6 -https://conda.anaconda.org/t//conda-forge/linux-64/reproc-cpp-14.2.3-h9c3ff4c_0.tar.bz2#1fc15d3b393b62192d3eeade92b61610 -https://conda.anaconda.org/t//conda-forge/linux-64/tk-8.6.12-h27826a3_0.tar.bz2#5b8c42eb62e9fc961af70bdd6a26e168 -https://conda.anaconda.org/t//conda-forge/linux-64/zeromq-4.3.4-h9c3ff4c_1.tar.bz2#21743a8d2ea0c8cfbbf8fe489b0347df -https://conda.anaconda.org/t//conda-forge/linux-64/zlib-1.2.11-h166bdaf_1014.tar.bz2#def3b82d1a03aa695bb38ac1dd072ff2 -https://conda.anaconda.org/t//conda-forge/linux-64/zstd-1.5.2-ha95c52a_0.tar.bz2#5222b231b1ef49a7f60d40b363469b70 -https://conda.anaconda.org/t//conda-forge/linux-64/krb5-1.19.3-h3790be6_0.tar.bz2#7d862b05445123144bec92cb1acc8ef8 
-https://conda.anaconda.org/t//conda-forge/linux-64/libcblas-3.9.0-14_linux64_openblas.tar.bz2#1b41ea4c32014d878e84de4e5690df7a -https://conda.anaconda.org/t//conda-forge/linux-64/libglib-2.70.2-h174f98d_4.tar.bz2#d44314ffae96b17657fbf3f8e47b04fc -https://conda.anaconda.org/t//conda-forge/linux-64/liblapack-3.9.0-14_linux64_openblas.tar.bz2#13367ebd0243a949cee7564b13c3cd42 -https://conda.anaconda.org/t//conda-forge/linux-64/libnghttp2-1.47.0-h727a467_0.tar.bz2#a22567abfea169ff8048506b1ca9b230 -https://conda.anaconda.org/t//conda-forge/linux-64/libssh2-1.10.0-ha56f1ee_2.tar.bz2#6ab4eaa11ff01801cffca0a27489dc04 -https://conda.anaconda.org/t//conda-forge/linux-64/libxml2-2.9.14-h22db469_0.tar.bz2#7d623237b73d93dd856b5dd0f5fedd6b -https://conda.anaconda.org/t//conda-forge/linux-64/nodejs-17.9.0-h96d913c_0.tar.bz2#760b5a71dbaaf438e83824f55b2dbb9e -https://conda.anaconda.org/t//conda-forge/linux-64/pcre2-10.37-h032f7d1_0.tar.bz2#6469e4602e914febe6f057ad2271a54e -https://conda.anaconda.org/t//conda-forge/linux-64/sqlite-3.38.5-h4ff8645_0.tar.bz2#a1448f0c31baec3946d2dcf09f905c9e -https://conda.anaconda.org/t//conda-forge/linux-64/dbus-1.13.6-h5008d03_3.tar.bz2#ecfff944ba3960ecb334b9a2663d708d -https://conda.anaconda.org/t//conda-forge/linux-64/libarchive-3.5.2-hccf745f_1.tar.bz2#c777ce221e0f3f1aade66074405d042e -https://conda.anaconda.org/t//conda-forge/linux-64/libcurl-7.83.0-h7bff187_0.tar.bz2#e2d939fa77fe69cd50f751961f17786a -https://conda.anaconda.org/t//conda-forge/linux-64/libxslt-1.1.33-h8affb1d_4.tar.bz2#47437b917a43a9650f0d14d7a54753a4 -https://conda.anaconda.org/t//conda-forge/linux-64/python-3.9.12-h9a8a25e_1_cpython.tar.bz2#06dadf5df9d340439c2aa32e15099d31 -https://conda.anaconda.org/t//conda-forge/noarch/alabaster-0.7.12-py_0.tar.bz2#2489a97287f90176ecdc3ca982b4b0a0 -https://conda.anaconda.org/t//conda-forge/noarch/appdirs-1.4.4-pyh9f0ad1d_0.tar.bz2#5f095bc6454094e96f146491fd03633b 
-https://conda.anaconda.org/t//conda-forge/noarch/argh-0.26.2-pyh9f0ad1d_1002.tar.bz2#0af89261f0352895e1c1000d306b3dc7 -https://conda.anaconda.org/t//conda-forge/noarch/attrs-21.4.0-pyhd8ed1ab_0.tar.bz2#f70280205d7044c8b8358c8de3190e5d -https://conda.anaconda.org/t//conda-forge/noarch/backcall-0.2.0-pyh9f0ad1d_0.tar.bz2#6006a6d08a3fa99268a2681c7fb55213 -https://conda.anaconda.org/t//conda-forge/noarch/backports-1.0-py_2.tar.bz2#0da16b293affa6ac31812376f8eb79dd -https://conda.anaconda.org/t//conda-forge/noarch/cachy-0.3.0-py_0.tar.bz2#808c46dc56ae4a796830129aaf1b51ec -https://conda.anaconda.org/t//conda-forge/noarch/charset-normalizer-2.0.12-pyhd8ed1ab_0.tar.bz2#1f5b32dabae0f1893ae3283dac7f799e -https://conda.anaconda.org/t//conda-forge/noarch/colorama-0.4.4-pyh9f0ad1d_0.tar.bz2#c08b4c1326b880ed44f3ffb04803332f -https://conda.anaconda.org/t//conda-forge/noarch/crashtest-0.3.1-pyhd8ed1ab_0.tar.bz2#b8477552274c1cfdb533e954c76523f1 -https://conda.anaconda.org/t//conda-forge/linux-64/curl-7.83.0-h7bff187_0.tar.bz2#81e39fb3ae82be7e8d2dd7046f393588 -https://conda.anaconda.org/t//conda-forge/noarch/dataclasses-0.8-pyhc8e2a94_3.tar.bz2#a362b2124b06aad102e2ee4581acee7d -https://conda.anaconda.org/t//conda-forge/noarch/decorator-5.1.1-pyhd8ed1ab_0.tar.bz2#43afe5ab04e35e17ba28649471dd7364 -https://conda.anaconda.org/t//conda-forge/noarch/defusedxml-0.7.1-pyhd8ed1ab_0.tar.bz2#961b3a227b437d82ad7054484cfa71b2 -https://conda.anaconda.org/t//conda-forge/noarch/distlib-0.3.4-pyhd8ed1ab_0.tar.bz2#7b50d840543d9cdae100e91582c33035 -https://conda.anaconda.org/t//conda-forge/noarch/entrypoints-0.4-pyhd8ed1ab_0.tar.bz2#3cf04868fee0a029769bd41f4b2fbf2d -https://conda.anaconda.org/t//conda-forge/noarch/executing-0.8.3-pyhd8ed1ab_0.tar.bz2#8d70f4543c1f701b946f85e9f9a00800 -https://conda.anaconda.org/t//conda-forge/noarch/filelock-3.6.0-pyhd8ed1ab_0.tar.bz2#6e03ca6c7b47a4152a2b12c6eee3bd32 
-https://conda.anaconda.org/t//conda-forge/noarch/flit-core-3.7.1-pyhd8ed1ab_0.tar.bz2#f93822cba5c20161560661988a88f2c0 -https://conda.anaconda.org/t//conda-forge/noarch/idna-3.3-pyhd8ed1ab_0.tar.bz2#40b50b8b030f5f2f22085c062ed013dd -https://conda.anaconda.org/t//conda-forge/noarch/imagesize-1.3.0-pyhd8ed1ab_0.tar.bz2#be807e7606fff9436e5e700f6bffb7c6 -https://conda.anaconda.org/t//conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2#5071c982548b3a20caf70462f04f5287 -https://conda.anaconda.org/t//conda-forge/noarch/jeepney-0.8.0-pyhd8ed1ab_0.tar.bz2#9800ad1699b42612478755a2d26c722d -https://conda.anaconda.org/t//conda-forge/noarch/json5-0.9.5-pyh9f0ad1d_0.tar.bz2#10759827a94e6b14996e81fb002c0bda -https://conda.anaconda.org/t//conda-forge/noarch/jupyterlab_widgets-1.1.0-pyhd8ed1ab_0.tar.bz2#e963a4a39cf442dbe5503f66edda083d -https://conda.anaconda.org/t//conda-forge/linux-64/libmamba-0.23.0-hd8a31e3_1.tar.bz2#f2493e48d81be2e9d0e4d4e719e31e08 -https://conda.anaconda.org/t//conda-forge/noarch/lockfile-0.12.2-py_1.tar.bz2#c104d98e09c47519950cffb8dd5b4f10 -https://conda.anaconda.org/t//conda-forge/noarch/nest-asyncio-1.5.5-pyhd8ed1ab_0.tar.bz2#dc36c992aec485c0efff619ed2e63957 -https://conda.anaconda.org/t//conda-forge/noarch/pandocfilters-1.5.0-pyhd8ed1ab_0.tar.bz2#457c2c8c08e54905d6954e79cb5b5db9 -https://conda.anaconda.org/t//conda-forge/noarch/parso-0.8.3-pyhd8ed1ab_0.tar.bz2#17a565a0c3899244e938cdf417e7b094 -https://conda.anaconda.org/t//conda-forge/noarch/pastel-0.2.1-pyhd8ed1ab_0.tar.bz2#a4eea5bff523f26442405bc5d1f52adb -https://conda.anaconda.org/t//conda-forge/noarch/pickleshare-0.7.5-py_1003.tar.bz2#415f0ebb6198cc2801c73438a9fb5761 -https://conda.anaconda.org/t//conda-forge/noarch/pkginfo-1.8.2-pyhd8ed1ab_0.tar.bz2#c776a1cd5745674c28c20a5498cafa89 -https://conda.anaconda.org/t//conda-forge/noarch/platformdirs-2.5.1-pyhd8ed1ab_0.tar.bz2#d5df87964a39f67c46a5448f4e78d9b6 
-https://conda.anaconda.org/t//conda-forge/noarch/prometheus_client-0.14.1-pyhd8ed1ab_0.tar.bz2#b7fa7d86530b8de805268e48988eb483 -https://conda.anaconda.org/t//conda-forge/noarch/ptyprocess-0.7.0-pyhd3deb0d_0.tar.bz2#359eeb6536da0e687af562ed265ec263 -https://conda.anaconda.org/t//conda-forge/noarch/pure_eval-0.2.2-pyhd8ed1ab_0.tar.bz2#6784285c7e55cb7212efabc79e4c2883 -https://conda.anaconda.org/t//conda-forge/noarch/pycparser-2.21-pyhd8ed1ab_0.tar.bz2#076becd9e05608f8dc72757d5f3a91ff -https://conda.anaconda.org/t//conda-forge/noarch/pylev-1.4.0-pyhd8ed1ab_0.tar.bz2#edf8651c4379d9d1495ad6229622d150 -https://conda.anaconda.org/t//conda-forge/noarch/pyparsing-3.0.8-pyhd8ed1ab_0.tar.bz2#7f5738c49fdccd0fc755bfd25a5ea66c -https://conda.anaconda.org/t//conda-forge/noarch/python-fastjsonschema-2.15.3-pyhd8ed1ab_0.tar.bz2#fae309d1cc996da1f63de9d321e65e27 -https://conda.anaconda.org/t//conda-forge/linux-64/python_abi-3.9-2_cp39.tar.bz2#39adde4247484de2bb4000122fdcf665 -https://conda.anaconda.org/t//conda-forge/noarch/pytz-2022.1-pyhd8ed1ab_0.tar.bz2#b87d66d6d3991d988fb31510c95a9267 -https://conda.anaconda.org/t//conda-forge/noarch/send2trash-1.8.0-pyhd8ed1ab_0.tar.bz2#edab14119efe85c3bf131ad747e9005c -https://conda.anaconda.org/t//conda-forge/noarch/shellingham-1.4.0-pyh44b312d_0.tar.bz2#437655338696f9d0dfdb0a024e66b255 -https://conda.anaconda.org/t//conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2 -https://conda.anaconda.org/t//conda-forge/noarch/smmap-3.0.5-pyh44b312d_0.tar.bz2#3a8dc70789709aa315325d5df06fb7e4 -https://conda.anaconda.org/t//conda-forge/noarch/snowballstemmer-2.2.0-pyhd8ed1ab_0.tar.bz2#4d22a9315e78c6827f806065957d566e -https://conda.anaconda.org/t//conda-forge/noarch/soupsieve-2.3.1-pyhd8ed1ab_0.tar.bz2#d821b295c4bd18ad27e1e19543a5784a -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-applehelp-1.0.2-py_0.tar.bz2#20b2eaeaeea4ef9a9a0d99770620fd09 
-https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-devhelp-1.0.2-py_0.tar.bz2#68e01cac9d38d0e717cd5c87bc3d2cc9 -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.0-pyhd8ed1ab_0.tar.bz2#77dad82eb9c8c1525ff7953e0756d708 -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-jsmath-1.0.1-py_0.tar.bz2#67cd9d9c0382d37479b4d306c369a2d4 -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-qthelp-1.0.3-py_0.tar.bz2#d01180388e6d1838c3e1ad029590aa7a -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.5-pyhd8ed1ab_2.tar.bz2#9ff55a0901cf952f05c654394de76bf7 -https://conda.anaconda.org/t//conda-forge/noarch/threadpoolctl-3.1.0-pyh8a188c0_0.tar.bz2#a2995ee828f65687ac5b1e71a2ab1e0c -https://conda.anaconda.org/t//conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2#f832c45a477c78bebd107098db465095 -https://conda.anaconda.org/t//conda-forge/noarch/toolz-0.11.2-pyhd8ed1ab_0.tar.bz2#f348d1590550371edfac5ed3c1d44f7e -https://conda.anaconda.org/t//conda-forge/noarch/traitlets-5.2.0-pyhd8ed1ab_0.tar.bz2#b81786ff00b93d07560ea21d98a2b266 -https://conda.anaconda.org/t//conda-forge/noarch/typing-3.10.0.0-pyhd8ed1ab_0.tar.bz2#e6573ac68718f17b9d4f5c8eda3190f2 -https://conda.anaconda.org/t//conda-forge/noarch/typing_extensions-4.2.0-pyha770c72_1.tar.bz2#f0f7e024f94e23d3bfee0ab777bf335a -https://conda.anaconda.org/t//conda-forge/noarch/uc-micro-py-1.0.1-pyhd8ed1ab_0.tar.bz2#3ddf6684d9b274a12c94e509ca45656c -https://conda.anaconda.org/t//conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2#3563be4c5611a44210d9ba0c16113136 -https://conda.anaconda.org/t//conda-forge/noarch/websocket-client-1.3.2-pyhd8ed1ab_0.tar.bz2#da6f472c62b4eda0caf05e223729efcd -https://conda.anaconda.org/t//conda-forge/noarch/wheel-0.37.1-pyhd8ed1ab_0.tar.bz2#1ca02aaf78d9c70d9a81a3bed5752022 -https://conda.anaconda.org/t//conda-forge/noarch/zipp-3.8.0-pyhd8ed1ab_0.tar.bz2#050b94cf4a8c760656e51d2d44e4632c 
-https://conda.anaconda.org/t//conda-forge/noarch/asttokens-2.0.5-pyhd8ed1ab_0.tar.bz2#74badce16f060701fee55c39332f5253 -https://conda.anaconda.org/t//conda-forge/noarch/babel-2.10.1-pyhd8ed1ab_0.tar.bz2#2ec70a4a964b696170d730466c668f60 -https://conda.anaconda.org/t//conda-forge/noarch/beautifulsoup4-4.11.1-pyha770c72_0.tar.bz2#eeec8814bd97b2681f708bb127478d7d -https://conda.anaconda.org/t//conda-forge/linux-64/certifi-2021.10.8-py39hf3d152e_2.tar.bz2#eb728d82a814f6f4d1a62db2422e004e -https://conda.anaconda.org/t//conda-forge/linux-64/cffi-1.15.0-py39h4bc2ebd_0.tar.bz2#f6191bf565dee581e77549d63737751c -https://conda.anaconda.org/t//conda-forge/linux-64/click-8.1.3-py39hf3d152e_0.tar.bz2#40edd9ebc04e4b4ec27c1008e5e3f99d -https://conda.anaconda.org/t//conda-forge/noarch/clikit-0.6.2-pyh9f0ad1d_0.tar.bz2#159273f717a11e53b2656f8b6521a5e2 -https://conda.anaconda.org/t//conda-forge/linux-64/debugpy-1.6.0-py39h5a03fae_0.tar.bz2#93ec11c7d7b7a2ff559f653dc9ca1e2b -https://conda.anaconda.org/t//conda-forge/linux-64/docutils-0.16-py39hf3d152e_3.tar.bz2#4f0fa7459a1f40a969aaad418b1c428c -https://conda.anaconda.org/t//conda-forge/linux-64/git-2.35.3-pl5321h36853c3_0.tar.bz2#a4033955267d65d8579d5af863641d97 -https://conda.anaconda.org/t//conda-forge/noarch/gitdb-4.0.9-pyhd8ed1ab_0.tar.bz2#40fc6b14a45dee3a3fd9f302d026108e -https://conda.anaconda.org/t//conda-forge/linux-64/greenlet-1.1.2-py39h5a03fae_2.tar.bz2#d7d11bc86d42472816cfb0f9f27da0ad -https://conda.anaconda.org/t//conda-forge/noarch/html5lib-1.1-pyh9f0ad1d_0.tar.bz2#b2355343d6315c892543200231d7154a -https://conda.anaconda.org/t//conda-forge/linux-64/importlib-metadata-4.11.3-py39hf3d152e_1.tar.bz2#fbcc5eab83e938242d70834ad70a8f20 -https://conda.anaconda.org/t//conda-forge/noarch/importlib_resources-5.7.1-pyhd8ed1ab_0.tar.bz2#8a50c32f48abec73bc3dd4df0d133892 -https://conda.anaconda.org/t//conda-forge/linux-64/jedi-0.18.1-py39hf3d152e_1.tar.bz2#941922f60b68bc6768c9360afb94d138 
-https://conda.anaconda.org/t//conda-forge/linux-64/jupyter_core-4.9.2-py39hf3d152e_0.tar.bz2#f5b7cd7077a4aa49aa2615c50cf28e2c -https://conda.anaconda.org/t//conda-forge/noarch/latexcodec-2.0.1-pyh9f0ad1d_0.tar.bz2#8d67904973263afd2985ba56aa2d6bb4 -https://conda.anaconda.org/t//conda-forge/linux-64/libmambapy-0.23.0-py39hd55135b_1.tar.bz2#c724e99766ee68979062fc63ab6db058 -https://conda.anaconda.org/t//conda-forge/noarch/linkify-it-py-1.0.3-pyhd8ed1ab_0.tar.bz2#ba4b07f6a132c77eb69ede31a6ed790b -https://conda.anaconda.org/t//conda-forge/linux-64/lxml-4.8.0-py39hb9d737c_3.tar.bz2#b2e9b0f987bd242c80d7dfda003a2c39 -https://conda.anaconda.org/t//conda-forge/noarch/markdown-it-py-1.1.0-pyhd8ed1ab_0.tar.bz2#84e8dfb1a9e6a824f32fd45b867271ca -https://conda.anaconda.org/t//conda-forge/linux-64/markupsafe-2.1.1-py39hb9d737c_1.tar.bz2#7cda413e43b252044a270c2477031c5c -https://conda.anaconda.org/t//conda-forge/noarch/matplotlib-inline-0.1.3-pyhd8ed1ab_0.tar.bz2#be3bfd435802d2c768c6b2439f325f3d -https://conda.anaconda.org/t//conda-forge/linux-64/mistune-0.8.4-py39h3811e60_1005.tar.bz2#95eb8cbf40bccdcb34888c9e56371570 -https://conda.anaconda.org/t//conda-forge/linux-64/msgpack-python-1.0.3-py39hf939315_1.tar.bz2#9d47ff7dffb54ed6b10bd4e5087af505 -https://conda.anaconda.org/t//conda-forge/linux-64/numpy-1.22.3-py39hc58783e_2.tar.bz2#e682ad4e85c7fda7dd0f0283d3b2ae8e -https://conda.anaconda.org/t//conda-forge/noarch/packaging-20.9-pyh44b312d_0.tar.bz2#be69a38e912054a62dc82cc3c7711a64 -https://conda.anaconda.org/t//conda-forge/noarch/pexpect-4.8.0-pyh9f0ad1d_2.tar.bz2#5909e7b978141dd80d28dbf9de627827 -https://conda.anaconda.org/t//conda-forge/linux-64/poetry-core-1.0.8-py39hf3d152e_1.tar.bz2#bfefe349de77edb720cb4688821ff78e -https://conda.anaconda.org/t//conda-forge/linux-64/psutil-5.9.0-py39hb9d737c_1.tar.bz2#078ad072b9d417cbe620455a2a0e3394 -https://conda.anaconda.org/t//conda-forge/linux-64/pycosat-0.6.3-py39hb9d737c_1010.tar.bz2#b7d981539b1a880d19c6a158104a3fa1 
-https://conda.anaconda.org/t//conda-forge/linux-64/pyrsistent-0.18.1-py39hb9d737c_1.tar.bz2#e2575d7508c7933047544ac7a15e021d -https://conda.anaconda.org/t//conda-forge/linux-64/pysocks-1.7.1-py39hf3d152e_5.tar.bz2#d34b97a2386932b97c7cb80916a673e7 -https://conda.anaconda.org/t//conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2#dd999d1cc9f79e67dbb855c8924c7984 -https://conda.anaconda.org/t//conda-forge/linux-64/pyyaml-6.0-py39hb9d737c_4.tar.bz2#dcc47a3b751508507183d17e569805e5 -https://conda.anaconda.org/t//conda-forge/linux-64/pyzmq-22.3.0-py39headdf64_2.tar.bz2#79fce3734cb80a666837952b6a712ddb -https://conda.anaconda.org/t//conda-forge/linux-64/ruamel.yaml.clib-0.2.6-py39hb9d737c_1.tar.bz2#a0fabd69dd35bb24ec84d28dc01c3c5b -https://conda.anaconda.org/t//conda-forge/linux-64/ruamel_yaml-0.15.80-py39h3811e60_1006.tar.bz2#c9e96b53141c8b1bc214e6a90611e2ca -https://conda.anaconda.org/t//conda-forge/linux-64/setuptools-62.1.0-py39hf3d152e_0.tar.bz2#199fa7c3b8ea037543cce82563918a59 -https://conda.anaconda.org/t//conda-forge/linux-64/sniffio-1.2.0-py39hf3d152e_3.tar.bz2#e2cb114a39b27ef1687a0c2c3e793cf6 -https://conda.anaconda.org/t//conda-forge/noarch/tinycss2-1.1.1-pyhd8ed1ab_0.tar.bz2#5d280406501e79dc7aa9c9ac31d25a80 -https://conda.anaconda.org/t//conda-forge/noarch/tomlkit-0.10.2-pyha770c72_0.tar.bz2#482e5775f80665a7c9f76cd72a66eae8 -https://conda.anaconda.org/t//conda-forge/linux-64/tornado-6.1-py39hb9d737c_3.tar.bz2#5e13a2d214ed4184969df363a1aab420 -https://conda.anaconda.org/t//conda-forge/noarch/tqdm-4.64.0-pyhd8ed1ab_0.tar.bz2#6642233f341e1900d0c8e6eddb979c14 -https://conda.anaconda.org/t//conda-forge/noarch/typing-extensions-4.2.0-hd8ed1ab_1.tar.bz2#6d9d7480c5780514779967be2ee8b963 -https://conda.anaconda.org/t//conda-forge/linux-64/virtualenv-20.14.1-py39hf3d152e_0.tar.bz2#6d7e213edf6669391700946201a63bc0 -https://conda.anaconda.org/t//conda-forge/linux-64/anyio-3.5.0-py39hf3d152e_0.tar.bz2#b0d75a9d3fd02ec079504aabb7cd7ec3 
-https://conda.anaconda.org/t//conda-forge/linux-64/argon2-cffi-bindings-21.2.0-py39hb9d737c_2.tar.bz2#76139de3552a2046135eb0b2d02a9c85 -https://conda.anaconda.org/t//conda-forge/noarch/backports.functools_lru_cache-1.6.4-pyhd8ed1ab_0.tar.bz2#c5b3edc62d6309088f4970b3eaaa65a6 -https://conda.anaconda.org/t//conda-forge/noarch/bleach-5.0.0-pyhd8ed1ab_0.tar.bz2#2a2ae7c56b8f72caba261363407b484a -https://conda.anaconda.org/t//conda-forge/linux-64/brotlipy-0.7.0-py39hb9d737c_1004.tar.bz2#05a99367d885ec9990f25e74128a8a08 -https://conda.anaconda.org/t//conda-forge/noarch/cleo-0.8.1-pyhd8ed1ab_2.tar.bz2#4c82b11a3d06031bd58e7d869f53d965 -https://conda.anaconda.org/t//conda-forge/noarch/click-default-group-1.2.2-pyhd8ed1ab_1.tar.bz2#72a46ffc25701c173932fd55cf0965d3 -https://conda.anaconda.org/t//conda-forge/noarch/click-log-0.3.2-pyh9f0ad1d_0.tar.bz2#3a64d156136fad977df1b81a24b57ac0 -https://conda.anaconda.org/t//conda-forge/linux-64/conda-package-handling-1.8.1-py39hb9d737c_1.tar.bz2#1fadb17b68893d479b0a01981570a494 -https://conda.anaconda.org/t//conda-forge/linux-64/cryptography-36.0.2-py39hd97740a_1.tar.bz2#dbd00b111b182f40ecf998c8289fc4a2 -https://conda.anaconda.org/t//conda-forge/noarch/ghp-import-2.1.0-pyhd8ed1ab_0.tar.bz2#6d8d61116031a3f5b1f32e7899785866 -https://conda.anaconda.org/t//conda-forge/noarch/gitpython-3.1.27-pyhd8ed1ab_0.tar.bz2#20acbaab17a50ac9b64138eb9a0e1af8 -https://conda.anaconda.org/t//conda-forge/noarch/importlib_metadata-4.11.3-hd8ed1ab_1.tar.bz2#bd6b6ae37c03e68061574d5e32fe5bd1 -https://conda.anaconda.org/t//conda-forge/noarch/jinja2-3.0.3-pyhd8ed1ab_0.tar.bz2#036d872c653780cb26e797e2e2f61b4c -https://conda.anaconda.org/t//conda-forge/noarch/joblib-1.1.0-pyhd8ed1ab_0.tar.bz2#07d1b5c8cde14d95998fd4767e1e62d2 -https://conda.anaconda.org/t//conda-forge/noarch/jsonschema-3.2.0-pyhd8ed1ab_3.tar.bz2#66125e28711d8ffc04a207a2b170316d -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_client-7.3.1-pyhd8ed1ab_0.tar.bz2#38481a37ead8c37d2ad7b52d3bc2b0a7 
-https://conda.anaconda.org/t//conda-forge/noarch/mdit-py-plugins-0.2.8-pyhd8ed1ab_0.tar.bz2#49236fcd746a124eb56d326f79e1d46d -https://conda.anaconda.org/t//conda-forge/linux-64/pandas-1.4.2-py39h1832856_1.tar.bz2#264505bcd299b8d564195cfb3e6038f0 -https://conda.anaconda.org/t//conda-forge/noarch/pip-22.0.4-pyhd8ed1ab_0.tar.bz2#b1239ce8ef2a1eec485c398a683c5bff -https://conda.anaconda.org/t//conda-forge/noarch/portpicker-1.5.0-pyhd8ed1ab_0.tar.bz2#5f9595b4b3f50a0d572b0a7c8b4293c7 -https://conda.anaconda.org/t//conda-forge/noarch/pybtex-0.24.0-pyhd8ed1ab_2.tar.bz2#2099b86a7399c44c0c61cdb6de6915ba -https://conda.anaconda.org/t//conda-forge/linux-64/pydantic-1.9.0-py39hb9d737c_1.tar.bz2#5e0f9261bada9bc3cbd4269240993f0b -https://conda.anaconda.org/t//conda-forge/noarch/pygments-2.12.0-pyhd8ed1ab_0.tar.bz2#cb27e2ded147e5bcc7eafc1c6d343cb3 -https://conda.anaconda.org/t//conda-forge/linux-64/ruamel.yaml-0.17.21-py39hb9d737c_1.tar.bz2#2b94cf785616198b112170b9838262a4 -https://conda.anaconda.org/t//conda-forge/linux-64/scipy-1.8.0-py39hee8e79c_1.tar.bz2#8cc32d8307f5a8eb319e242c61d259ec -https://conda.anaconda.org/t//conda-forge/linux-64/sqlalchemy-1.4.36-py39hb9d737c_0.tar.bz2#e432937ad2a6822f770c1a925a08200c -https://conda.anaconda.org/t//conda-forge/noarch/stack_data-0.2.0-pyhd8ed1ab_0.tar.bz2#8c0ce3e6bf18a0c810125aef58a2a6f3 -https://conda.anaconda.org/t//conda-forge/linux-64/terminado-0.13.3-py39hf3d152e_1.tar.bz2#bebf6da1adb04ed902ff335e61438a6e -https://conda.anaconda.org/t//conda-forge/linux-64/watchdog-2.1.7-py39hf3d152e_1.tar.bz2#1c40e7ce24039941617ef32b80e04114 -https://conda.anaconda.org/t//conda-forge/noarch/altair-4.2.0-pyhd8ed1ab_1.tar.bz2#2867acfe48ceb3630b163632914720d9 -https://conda.anaconda.org/t//conda-forge/noarch/argon2-cffi-21.3.0-pyhd8ed1ab_0.tar.bz2#a0b402db58f73aaab8ee0ca1025a362e -https://conda.anaconda.org/t//conda-forge/linux-64/click-completion-0.5.2-py39hf3d152e_3.tar.bz2#af5cc0d8d34180fca8b8fa7ba438c9b2 
-https://conda.anaconda.org/t//conda-forge/noarch/jupyterlab_pygments-0.2.2-pyhd8ed1ab_0.tar.bz2#243f63592c8e449f40cd42eb5cf32f40 -https://conda.anaconda.org/t//conda-forge/noarch/nbformat-5.4.0-pyhd8ed1ab_0.tar.bz2#770f6659243e2c79a0b8488b0e463bd1 -https://conda.anaconda.org/t//conda-forge/linux-64/pybtex-docutils-1.0.1-py39hf3d152e_1.tar.bz2#d1febb9b765260fe964d3d4e3cc22479 -https://conda.anaconda.org/t//conda-forge/noarch/pyopenssl-22.0.0-pyhd8ed1ab_0.tar.bz2#1d7e241dfaf5475e893d4b824bb71b44 -https://conda.anaconda.org/t//conda-forge/linux-64/scikit-learn-1.0.2-py39h4dfa638_0.tar.bz2#389a22c8af77c8575a8a848fef79790e -https://conda.anaconda.org/t//conda-forge/linux-64/secretstorage-3.3.2-py39hf3d152e_1.tar.bz2#fd79e4bf83b5a19ceda3508a2adc2eb1 -https://conda.anaconda.org/t//conda-forge/noarch/wcwidth-0.2.5-pyh9f0ad1d_2.tar.bz2#5266fcd697043c59621fda522b3d78ee -https://conda.anaconda.org/t//conda-forge/noarch/altair_data_server-0.4.1-py_0.tar.bz2#5412ec3d2792d3e2f18e075cb05ffdaf -https://conda.anaconda.org/t//conda-forge/noarch/jupytext-1.13.8-pyh4b9bcc7_0.tar.bz2#aba00353637aa69640ec2ae150dc592d -https://conda.anaconda.org/t//conda-forge/linux-64/keyring-23.4.0-py39hf3d152e_2.tar.bz2#ad48fe501f6b4804e8c15a474f9c8968 -https://conda.anaconda.org/t//conda-forge/noarch/nbclient-0.5.13-pyhd8ed1ab_0.tar.bz2#3edde88a191701cf052216c4ba353a83 -https://conda.anaconda.org/t//conda-forge/noarch/prompt-toolkit-3.0.29-pyha770c72_0.tar.bz2#9e720b57b22ef3032b4fb081697819dd -https://conda.anaconda.org/t//conda-forge/noarch/urllib3-1.26.9-pyhd8ed1ab_0.tar.bz2#0ea179ee251aa7100807c35bc0252693 -https://conda.anaconda.org/t//conda-forge/linux-64/ipython-8.3.0-py39hf3d152e_0.tar.bz2#731e626303529ba6205adb1014686858 -https://conda.anaconda.org/t//conda-forge/noarch/nbconvert-core-6.5.0-pyhd8ed1ab_0.tar.bz2#42f74c4b38a099025167e76a7437edf1 -https://conda.anaconda.org/t//conda-forge/noarch/requests-2.27.1-pyhd8ed1ab_0.tar.bz2#7c1c427246b057b8fa97200ecdb2ed62 
-https://conda.anaconda.org/t//conda-forge/noarch/cachecontrol-0.12.11-pyhd8ed1ab_0.tar.bz2#6eefee9888f33f150b5d44d616b1a613 -https://conda.anaconda.org/t//conda-forge/linux-64/conda-4.12.0-py39hf3d152e_0.tar.bz2#fb54573fc3909e06c3f289e0fbf9ca3d -https://conda.anaconda.org/t//conda-forge/noarch/ensureconda-1.4.2-pyhd8ed1ab_0.tar.bz2#1bf97b25d058482fd73e62b1cdb932ef -https://conda.anaconda.org/t//conda-forge/linux-64/ipykernel-6.13.0-py39hef51801_0.tar.bz2#9fd2a497e68e4c715d9fdf3f36b44072 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_server-1.17.0-pyhd8ed1ab_0.tar.bz2#276b3b45443e2a84cfb4d128cb86c350 -https://conda.anaconda.org/t//conda-forge/noarch/nbconvert-pandoc-6.5.0-pyhd8ed1ab_0.tar.bz2#d7421adfc67100021d87032447066129 -https://conda.anaconda.org/t//conda-forge/noarch/pooch-1.6.0-pyhd8ed1ab_0.tar.bz2#6429e1d1091c51f626b5dcfdd38bf429 -https://conda.anaconda.org/t//conda-forge/noarch/requests-toolbelt-0.9.1-py_0.tar.bz2#402668adee8fcba9a9c265cdc2a88f5a -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-4.5.0-pyh6c4a22f_0.tar.bz2#46b38d88c4270ff9ba78a89c83c66345 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter-server-mathjax-0.2.5-pyhc268e32_0.tar.bz2#0393370c2dec5e92e1727a8650f908f7 -https://conda.anaconda.org/t//conda-forge/noarch/jupyterlab_server-2.13.0-pyhd8ed1ab_1.tar.bz2#ecead930bfd8c0f629c5b8bf5c1e3508 -https://conda.anaconda.org/t//conda-forge/linux-64/mamba-0.23.0-py39hfa8f2c8_1.tar.bz2#5aeaa3d80ebfbbf408d46daf81bcd447 -https://conda.anaconda.org/t//conda-forge/noarch/myst-parser-0.15.2-pyhd8ed1ab_0.tar.bz2#0c2976e0a1af80ce224388da557eeece -https://conda.anaconda.org/t//conda-forge/noarch/nbconvert-6.5.0-pyhd8ed1ab_0.tar.bz2#156c180588e38b9f41758058824ec50f -https://conda.anaconda.org/t//conda-forge/noarch/notebook-shim-0.1.0-pyhd8ed1ab_0.tar.bz2#3a8e2c7dcc674f2cb0784f1faba57055 -https://conda.anaconda.org/t//conda-forge/linux-64/poetry-1.1.13-py39hf3d152e_1.tar.bz2#a10e45641e7ef946b8c4802c35e7fd44 
-https://conda.anaconda.org/t//conda-forge/noarch/pydata-sphinx-theme-0.7.2-pyhd8ed1ab_0.tar.bz2#123f5c52d6b4117225af45a52ec34997 -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-comments-0.0.3-pyh9f0ad1d_0.tar.bz2#2ae3ce35de0c1cec45c94182694f8d1b -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-copybutton-0.5.0-pyhd8ed1ab_0.tar.bz2#4c969cdd5191306c269490f7ff236d9c -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-external-toc-0.2.4-pyhd8ed1ab_0.tar.bz2#91ae8770569b73f25e1127526c1329ed -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-jupyterbook-latex-0.4.6-pyhd8ed1ab_0.tar.bz2#6e4a69a0c8adbb48178fdf3efa24fa4c -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-multitoc-numbering-0.1.3-pyhd8ed1ab_0.tar.bz2#40749a4d0f0d2e11c65fb26c1cd16a90 -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-panels-0.6.0-pyhd8ed1ab_0.tar.bz2#6eec6480601f5d15babf9c3b3987f34a -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-thebe-0.1.2-pyhd8ed1ab_0.tar.bz2#1d4fdd342aa955085a0f21e26bb585f7 -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-togglebutton-0.3.1-pyhd8ed1ab_0.tar.bz2#71418887aa6599ea2935f4958e5e1d15 -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-bibtex-2.4.2-pyhd8ed1ab_0.tar.bz2#d826ac2b3edfe7a8113596c2023f092b -https://conda.anaconda.org/t//conda-forge/noarch/conda-lock-1.0.5-pyhd8ed1ab_0.tar.bz2#7544764bbf4941cc954bb80911f3b201 -https://conda.anaconda.org/t//conda-forge/noarch/nbdime-3.1.1-pyhd8ed1ab_0.tar.bz2#38dc061ffabe665b79f4c7c52cefa809 -https://conda.anaconda.org/t//conda-forge/noarch/notebook-6.4.11-pyha770c72_0.tar.bz2#da25720a88aa3cbb3e16df740783da74 -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-book-theme-0.1.10-pyhd8ed1ab_1.tar.bz2#194ec0159031da65b653c665bb1678f6 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter-cache-0.4.3-pyhd8ed1ab_0.tar.bz2#03cd9218c96d513854bfc8714eaf9451 
-https://conda.anaconda.org/t//conda-forge/noarch/jupyter_contrib_core-0.3.3-py_2.tar.bz2#4704781b0a914d67e4ae8e9f9f5c37a0 -https://conda.anaconda.org/t//conda-forge/noarch/nbclassic-0.3.7-pyhd8ed1ab_0.tar.bz2#a8a7139140a7512c90514444444a4991 -https://conda.anaconda.org/t//conda-forge/linux-64/widgetsnbextension-3.6.0-py39hf3d152e_0.tar.bz2#eb9f42c0ef3263a1d7bad8b0deb446f5 -https://conda.anaconda.org/t//conda-forge/noarch/ipywidgets-7.7.0-pyhd8ed1ab_0.tar.bz2#a3d2ccd3d9f9fcb65765c22f500529b4 -https://conda.anaconda.org/t//conda-forge/linux-64/jupyter_highlight_selected_word-0.2.0-py39hf3d152e_1005.tar.bz2#a8cf0e4e8a3289e920c640e4c8ac843a -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_latex_envs-1.4.6-pyhd8ed1ab_1002.tar.bz2#4b888fd7d6b4cdb6736878b2cf8ea951 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_nbextensions_configurator-0.4.1-pyhd8ed1ab_2.tar.bz2#19a2fa481008976df3ed8ce5d4dfb8fa -https://conda.anaconda.org/t//conda-forge/noarch/jupyterlab-3.4.0-pyhd8ed1ab_0.tar.bz2#79d9efa21a8dbe3f9fdbe8069a61ac26 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter-sphinx-0.3.2-pyhd8ed1ab_1.tar.bz2#a47ca0f91417e5d29d075ca416254466 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_contrib_nbextensions-0.5.1-pyhd8ed1ab_2.tar.bz2#ee3820cb73867efb8c928b93e55f4de3 -https://conda.anaconda.org/t//conda-forge/noarch/myst-nb-0.13.2-pyhd8ed1ab_0.tar.bz2#800e968e63eb593c651a14907fd82d26 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter-book-0.12.3-pyhd8ed1ab_0.tar.bz2#1d6c5efa323c2cfd6196af5524aa5b78 diff --git a/unused/install/conda-osx-64.lock b/unused/install/conda-osx-64.lock deleted file mode 100644 index f5d2c64f..00000000 --- a/unused/install/conda-osx-64.lock +++ /dev/null @@ -1,259 +0,0 @@ -# Generated by conda-lock. 
-# platform: osx-64 -# input_hash: 866774b8507a84d7b8151db0d5e21fce1654997b6927d18dbd4bd16e2a387ee1 -@EXPLICIT -https://conda.anaconda.org/t//conda-forge/osx-64/bzip2-1.0.8-h0d85af4_4.tar.bz2#37edc4e6304ca87316e160f5ca0bd1b5 -https://conda.anaconda.org/t//conda-forge/osx-64/c-ares-1.18.1-h0d85af4_0.tar.bz2#00b3e98a61e6430808fe7a2534681f28 -https://conda.anaconda.org/t//conda-forge/osx-64/ca-certificates-2021.10.8-h033912b_0.tar.bz2#bb82d0243db9882b509702ecb69e38f0 -https://conda.anaconda.org/t//conda-forge/osx-64/libcxx-14.0.3-hc203e6f_0.tar.bz2#169cb4aae14ca89851deda1756b2ebf7 -https://conda.anaconda.org/t//conda-forge/osx-64/libev-4.33-haf1e3a3_1.tar.bz2#79dc2be110b2a3d1e97ec21f691c50ad -https://conda.anaconda.org/t//conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2#ccb34fb14960ad8b125962d3d79b31a9 -https://conda.anaconda.org/t//conda-forge/osx-64/libiconv-1.16-haf1e3a3_0.tar.bz2#c5fab167412a52e491c8e11453ae016f -https://conda.anaconda.org/t//conda-forge/osx-64/libsodium-1.0.18-hbcb3906_1.tar.bz2#24632c09ed931af617fe6d5292919cab -https://conda.anaconda.org/t//conda-forge/osx-64/libuv-1.43.0-h0d85af4_0.tar.bz2#f68a2895786d91cdae79116b2b014592 -https://conda.anaconda.org/t//conda-forge/osx-64/libzlib-1.2.11-h6c3fc93_1014.tar.bz2#49f20ed86f7ed34204be74fbb2868c60 -https://conda.anaconda.org/t//conda-forge/osx-64/llvm-openmp-14.0.3-ha654fa7_0.tar.bz2#321df8a58df1427266e9f9c18af3f7f5 -https://conda.anaconda.org/t//conda-forge/osx-64/lzo-2.10-haf1e3a3_1000.tar.bz2#0b6bca372a95d6c602c7a922e928ce79 -https://conda.anaconda.org/t//conda-forge/osx-64/ncurses-6.3-h96cf925_1.tar.bz2#76217ebfbb163ff2770a261f955a5861 -https://conda.anaconda.org/t//conda-forge/osx-64/pandoc-2.18-h694c41f_0.tar.bz2#42f9f041e9d250ee3bc7332b8d28e0a2 -https://conda.anaconda.org/t//conda-forge/osx-64/perl-5.32.1-2_h0d85af4_perl5.tar.bz2#dd13a8c2fac0cd8e102fcdc7bca1f077 -https://conda.anaconda.org/t//conda-forge/noarch/pybind11-abi-4-hd8ed1ab_3.tar.bz2#878f923dd6acc8aeb47a75da6c4098be 
-https://conda.anaconda.org/t//conda-forge/osx-64/reproc-14.2.3-h0d85af4_0.tar.bz2#6f87f4707e4daf5823ce56dce5d9fbea -https://conda.anaconda.org/t//conda-forge/noarch/tzdata-2022a-h191b570_0.tar.bz2#84be5301069417a2221187d2f435e0f7 -https://conda.anaconda.org/t//conda-forge/osx-64/xz-5.2.5-haf1e3a3_1.tar.bz2#41116deb499e9bc58048c297d6403ce6 -https://conda.anaconda.org/t//conda-forge/osx-64/yaml-0.2.5-h0d85af4_2.tar.bz2#d7e08fcf8259d742156188e8762b4d20 -https://conda.anaconda.org/t//conda-forge/osx-64/expat-2.4.8-h96cf925_0.tar.bz2#529d357c143fb98b9af77d687f82a3e0 -https://conda.anaconda.org/t//conda-forge/osx-64/gettext-0.19.8.1-hd1a6beb_1008.tar.bz2#28c370fc39becf486601d9e491a5e184 -https://conda.anaconda.org/t//conda-forge/osx-64/icu-70.1-h96cf925_0.tar.bz2#376635049e9b9b0bb875efd39dcd7b3b -https://conda.anaconda.org/t//conda-forge/osx-64/libedit-3.1.20191231-h0678c8f_2.tar.bz2#6016a8a1d0e63cac3de2c352cd40208b -https://conda.anaconda.org/t//conda-forge/osx-64/libgfortran5-9.3.0-h6c81a4c_23.tar.bz2#a6956ceb628b14594613cefee5127a7a -https://conda.anaconda.org/t//conda-forge/osx-64/libsolv-0.7.22-hd9580d2_0.tar.bz2#068ed0617893ecbccbf65a32ea1e8056 -https://conda.anaconda.org/t//conda-forge/osx-64/lz4-c-1.9.3-he49afe7_1.tar.bz2#05c08241b66631c00ca4f9e0b75320bc -https://conda.anaconda.org/t//conda-forge/osx-64/openssl-1.1.1o-hfe4f2af_0.tar.bz2#655048e118f0b7029e5c216a1d7a6189 -https://conda.anaconda.org/t//conda-forge/osx-64/readline-8.1-h05e3726_0.tar.bz2#2832e9b6a7caa7cb192fcda6cfcd8871 -https://conda.anaconda.org/t//conda-forge/osx-64/reproc-cpp-14.2.3-he49afe7_0.tar.bz2#7dafcfaa471cd16cbd73832cefc39770 -https://conda.anaconda.org/t//conda-forge/osx-64/tk-8.6.12-h5dbffcc_0.tar.bz2#8e9480d9c47061db2ed1b4ecce519a7f -https://conda.anaconda.org/t//conda-forge/osx-64/yaml-cpp-0.6.3-hb1e8313_4.tar.bz2#f56440cd47d05468d6fb2020f98002d8 -https://conda.anaconda.org/t//conda-forge/osx-64/zeromq-4.3.4-he49afe7_1.tar.bz2#1972d732b123ed04b60fd21e94f0b178 
-https://conda.anaconda.org/t//conda-forge/osx-64/zlib-1.2.11-h6c3fc93_1014.tar.bz2#98b82f6a8de694bc6259f2d1a69bc02b -https://conda.anaconda.org/t//conda-forge/osx-64/krb5-1.19.3-hb49756b_0.tar.bz2#e60363be26ab2a74326c06195d638447 -https://conda.anaconda.org/t//conda-forge/osx-64/libgfortran-5.0.0-9_3_0_h6c81a4c_23.tar.bz2#60f48cef2d50674e0428c5579b6c3f66 -https://conda.anaconda.org/t//conda-forge/osx-64/libnghttp2-1.47.0-h942079c_0.tar.bz2#86fc370e607a269b64ac6fa5d29e55e8 -https://conda.anaconda.org/t//conda-forge/osx-64/libssh2-1.10.0-h52ee1ee_2.tar.bz2#8c8f3804e8e252b47443cfe8e40eddf9 -https://conda.anaconda.org/t//conda-forge/osx-64/libxml2-2.9.14-h08a9926_0.tar.bz2#3f1b05fc03318121ba2c5eabbf28be2f -https://conda.anaconda.org/t//conda-forge/osx-64/nodejs-17.9.0-h3cde592_0.tar.bz2#8148f6cc0609bc58be41ecf4563c4280 -https://conda.anaconda.org/t//conda-forge/osx-64/pcre2-10.37-ha16e1b2_0.tar.bz2#a3be43be1fcee194d1e82e5ca2ce47bc -https://conda.anaconda.org/t//conda-forge/osx-64/sqlite-3.38.5-hd9f0692_0.tar.bz2#258c39c5e2eff8b8b29d1a027e4e1b5a -https://conda.anaconda.org/t//conda-forge/osx-64/zstd-1.5.2-h582d3a0_0.tar.bz2#df9ed22d1725b8fecaede2d579fd6bc4 -https://conda.anaconda.org/t//conda-forge/osx-64/libarchive-3.5.2-h2b60450_1.tar.bz2#598a850147b8622e8348cbf6578effaf -https://conda.anaconda.org/t//conda-forge/osx-64/libcurl-7.83.0-h372c54d_0.tar.bz2#189d7b818b1edae0199fd3f9dfdc072e -https://conda.anaconda.org/t//conda-forge/osx-64/libopenblas-0.3.20-openmp_hb3cd9ec_0.tar.bz2#d862e4a5c6e7bf0bc9d66a38f5c73142 -https://conda.anaconda.org/t//conda-forge/osx-64/libxslt-1.1.33-h5bff336_4.tar.bz2#885c26849179f63d56d3d0f1d52dbd2a -https://conda.anaconda.org/t//conda-forge/osx-64/python-3.9.12-h8b4d769_1_cpython.tar.bz2#ebd20128c3a2f2fe98c9e6562cf43f65 -https://conda.anaconda.org/t//conda-forge/noarch/alabaster-0.7.12-py_0.tar.bz2#2489a97287f90176ecdc3ca982b4b0a0 
-https://conda.anaconda.org/t//conda-forge/noarch/appdirs-1.4.4-pyh9f0ad1d_0.tar.bz2#5f095bc6454094e96f146491fd03633b -https://conda.anaconda.org/t//conda-forge/noarch/appnope-0.1.3-pyhd8ed1ab_0.tar.bz2#54ac328d703bff191256ffa1183126d1 -https://conda.anaconda.org/t//conda-forge/noarch/argh-0.26.2-pyh9f0ad1d_1002.tar.bz2#0af89261f0352895e1c1000d306b3dc7 -https://conda.anaconda.org/t//conda-forge/noarch/attrs-21.4.0-pyhd8ed1ab_0.tar.bz2#f70280205d7044c8b8358c8de3190e5d -https://conda.anaconda.org/t//conda-forge/noarch/backcall-0.2.0-pyh9f0ad1d_0.tar.bz2#6006a6d08a3fa99268a2681c7fb55213 -https://conda.anaconda.org/t//conda-forge/noarch/backports-1.0-py_2.tar.bz2#0da16b293affa6ac31812376f8eb79dd -https://conda.anaconda.org/t//conda-forge/noarch/cachy-0.3.0-py_0.tar.bz2#808c46dc56ae4a796830129aaf1b51ec -https://conda.anaconda.org/t//conda-forge/noarch/charset-normalizer-2.0.12-pyhd8ed1ab_0.tar.bz2#1f5b32dabae0f1893ae3283dac7f799e -https://conda.anaconda.org/t//conda-forge/noarch/colorama-0.4.4-pyh9f0ad1d_0.tar.bz2#c08b4c1326b880ed44f3ffb04803332f -https://conda.anaconda.org/t//conda-forge/noarch/crashtest-0.3.1-pyhd8ed1ab_0.tar.bz2#b8477552274c1cfdb533e954c76523f1 -https://conda.anaconda.org/t//conda-forge/osx-64/curl-7.83.0-h372c54d_0.tar.bz2#70cc7e2acae7f738c6c7afa94a2b7233 -https://conda.anaconda.org/t//conda-forge/noarch/dataclasses-0.8-pyhc8e2a94_3.tar.bz2#a362b2124b06aad102e2ee4581acee7d -https://conda.anaconda.org/t//conda-forge/noarch/decorator-5.1.1-pyhd8ed1ab_0.tar.bz2#43afe5ab04e35e17ba28649471dd7364 -https://conda.anaconda.org/t//conda-forge/noarch/defusedxml-0.7.1-pyhd8ed1ab_0.tar.bz2#961b3a227b437d82ad7054484cfa71b2 -https://conda.anaconda.org/t//conda-forge/noarch/distlib-0.3.4-pyhd8ed1ab_0.tar.bz2#7b50d840543d9cdae100e91582c33035 -https://conda.anaconda.org/t//conda-forge/noarch/entrypoints-0.4-pyhd8ed1ab_0.tar.bz2#3cf04868fee0a029769bd41f4b2fbf2d 
-https://conda.anaconda.org/t//conda-forge/noarch/executing-0.8.3-pyhd8ed1ab_0.tar.bz2#8d70f4543c1f701b946f85e9f9a00800 -https://conda.anaconda.org/t//conda-forge/noarch/filelock-3.6.0-pyhd8ed1ab_0.tar.bz2#6e03ca6c7b47a4152a2b12c6eee3bd32 -https://conda.anaconda.org/t//conda-forge/noarch/flit-core-3.7.1-pyhd8ed1ab_0.tar.bz2#f93822cba5c20161560661988a88f2c0 -https://conda.anaconda.org/t//conda-forge/noarch/idna-3.3-pyhd8ed1ab_0.tar.bz2#40b50b8b030f5f2f22085c062ed013dd -https://conda.anaconda.org/t//conda-forge/noarch/imagesize-1.3.0-pyhd8ed1ab_0.tar.bz2#be807e7606fff9436e5e700f6bffb7c6 -https://conda.anaconda.org/t//conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2#5071c982548b3a20caf70462f04f5287 -https://conda.anaconda.org/t//conda-forge/noarch/json5-0.9.5-pyh9f0ad1d_0.tar.bz2#10759827a94e6b14996e81fb002c0bda -https://conda.anaconda.org/t//conda-forge/noarch/jupyterlab_widgets-1.1.0-pyhd8ed1ab_0.tar.bz2#e963a4a39cf442dbe5503f66edda083d -https://conda.anaconda.org/t//conda-forge/osx-64/libblas-3.9.0-14_osx64_openblas.tar.bz2#7440571e6f75b795ebc25a71429ca99d -https://conda.anaconda.org/t//conda-forge/osx-64/libmamba-0.23.0-h2d3d89a_1.tar.bz2#0b9cec67dcaa4a72263b5f5fd123050f -https://conda.anaconda.org/t//conda-forge/noarch/lockfile-0.12.2-py_1.tar.bz2#c104d98e09c47519950cffb8dd5b4f10 -https://conda.anaconda.org/t//conda-forge/noarch/nest-asyncio-1.5.5-pyhd8ed1ab_0.tar.bz2#dc36c992aec485c0efff619ed2e63957 -https://conda.anaconda.org/t//conda-forge/noarch/pandocfilters-1.5.0-pyhd8ed1ab_0.tar.bz2#457c2c8c08e54905d6954e79cb5b5db9 -https://conda.anaconda.org/t//conda-forge/noarch/parso-0.8.3-pyhd8ed1ab_0.tar.bz2#17a565a0c3899244e938cdf417e7b094 -https://conda.anaconda.org/t//conda-forge/noarch/pastel-0.2.1-pyhd8ed1ab_0.tar.bz2#a4eea5bff523f26442405bc5d1f52adb -https://conda.anaconda.org/t//conda-forge/noarch/pickleshare-0.7.5-py_1003.tar.bz2#415f0ebb6198cc2801c73438a9fb5761 
-https://conda.anaconda.org/t//conda-forge/noarch/pkginfo-1.8.2-pyhd8ed1ab_0.tar.bz2#c776a1cd5745674c28c20a5498cafa89 -https://conda.anaconda.org/t//conda-forge/noarch/platformdirs-2.5.1-pyhd8ed1ab_0.tar.bz2#d5df87964a39f67c46a5448f4e78d9b6 -https://conda.anaconda.org/t//conda-forge/noarch/prometheus_client-0.14.1-pyhd8ed1ab_0.tar.bz2#b7fa7d86530b8de805268e48988eb483 -https://conda.anaconda.org/t//conda-forge/noarch/ptyprocess-0.7.0-pyhd3deb0d_0.tar.bz2#359eeb6536da0e687af562ed265ec263 -https://conda.anaconda.org/t//conda-forge/noarch/pure_eval-0.2.2-pyhd8ed1ab_0.tar.bz2#6784285c7e55cb7212efabc79e4c2883 -https://conda.anaconda.org/t//conda-forge/noarch/pycparser-2.21-pyhd8ed1ab_0.tar.bz2#076becd9e05608f8dc72757d5f3a91ff -https://conda.anaconda.org/t//conda-forge/noarch/pylev-1.4.0-pyhd8ed1ab_0.tar.bz2#edf8651c4379d9d1495ad6229622d150 -https://conda.anaconda.org/t//conda-forge/noarch/pyparsing-3.0.8-pyhd8ed1ab_0.tar.bz2#7f5738c49fdccd0fc755bfd25a5ea66c -https://conda.anaconda.org/t//conda-forge/noarch/python-fastjsonschema-2.15.3-pyhd8ed1ab_0.tar.bz2#fae309d1cc996da1f63de9d321e65e27 -https://conda.anaconda.org/t//conda-forge/osx-64/python_abi-3.9-2_cp39.tar.bz2#262f557ee8ca777fe2190956038024cd -https://conda.anaconda.org/t//conda-forge/noarch/pytz-2022.1-pyhd8ed1ab_0.tar.bz2#b87d66d6d3991d988fb31510c95a9267 -https://conda.anaconda.org/t//conda-forge/noarch/send2trash-1.8.0-pyhd8ed1ab_0.tar.bz2#edab14119efe85c3bf131ad747e9005c -https://conda.anaconda.org/t//conda-forge/noarch/shellingham-1.4.0-pyh44b312d_0.tar.bz2#437655338696f9d0dfdb0a024e66b255 -https://conda.anaconda.org/t//conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2 -https://conda.anaconda.org/t//conda-forge/noarch/smmap-3.0.5-pyh44b312d_0.tar.bz2#3a8dc70789709aa315325d5df06fb7e4 -https://conda.anaconda.org/t//conda-forge/noarch/snowballstemmer-2.2.0-pyhd8ed1ab_0.tar.bz2#4d22a9315e78c6827f806065957d566e 
-https://conda.anaconda.org/t//conda-forge/noarch/soupsieve-2.3.1-pyhd8ed1ab_0.tar.bz2#d821b295c4bd18ad27e1e19543a5784a -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-applehelp-1.0.2-py_0.tar.bz2#20b2eaeaeea4ef9a9a0d99770620fd09 -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-devhelp-1.0.2-py_0.tar.bz2#68e01cac9d38d0e717cd5c87bc3d2cc9 -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-htmlhelp-2.0.0-pyhd8ed1ab_0.tar.bz2#77dad82eb9c8c1525ff7953e0756d708 -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-jsmath-1.0.1-py_0.tar.bz2#67cd9d9c0382d37479b4d306c369a2d4 -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-qthelp-1.0.3-py_0.tar.bz2#d01180388e6d1838c3e1ad029590aa7a -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.5-pyhd8ed1ab_2.tar.bz2#9ff55a0901cf952f05c654394de76bf7 -https://conda.anaconda.org/t//conda-forge/noarch/threadpoolctl-3.1.0-pyh8a188c0_0.tar.bz2#a2995ee828f65687ac5b1e71a2ab1e0c -https://conda.anaconda.org/t//conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2#f832c45a477c78bebd107098db465095 -https://conda.anaconda.org/t//conda-forge/noarch/toolz-0.11.2-pyhd8ed1ab_0.tar.bz2#f348d1590550371edfac5ed3c1d44f7e -https://conda.anaconda.org/t//conda-forge/noarch/traitlets-5.2.0-pyhd8ed1ab_0.tar.bz2#b81786ff00b93d07560ea21d98a2b266 -https://conda.anaconda.org/t//conda-forge/noarch/typing-3.10.0.0-pyhd8ed1ab_0.tar.bz2#e6573ac68718f17b9d4f5c8eda3190f2 -https://conda.anaconda.org/t//conda-forge/noarch/typing_extensions-4.2.0-pyha770c72_1.tar.bz2#f0f7e024f94e23d3bfee0ab777bf335a -https://conda.anaconda.org/t//conda-forge/noarch/uc-micro-py-1.0.1-pyhd8ed1ab_0.tar.bz2#3ddf6684d9b274a12c94e509ca45656c -https://conda.anaconda.org/t//conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2#3563be4c5611a44210d9ba0c16113136 -https://conda.anaconda.org/t//conda-forge/noarch/websocket-client-1.3.2-pyhd8ed1ab_0.tar.bz2#da6f472c62b4eda0caf05e223729efcd 
-https://conda.anaconda.org/t//conda-forge/noarch/wheel-0.37.1-pyhd8ed1ab_0.tar.bz2#1ca02aaf78d9c70d9a81a3bed5752022 -https://conda.anaconda.org/t//conda-forge/noarch/zipp-3.8.0-pyhd8ed1ab_0.tar.bz2#050b94cf4a8c760656e51d2d44e4632c -https://conda.anaconda.org/t//conda-forge/noarch/asttokens-2.0.5-pyhd8ed1ab_0.tar.bz2#74badce16f060701fee55c39332f5253 -https://conda.anaconda.org/t//conda-forge/noarch/babel-2.10.1-pyhd8ed1ab_0.tar.bz2#2ec70a4a964b696170d730466c668f60 -https://conda.anaconda.org/t//conda-forge/noarch/beautifulsoup4-4.11.1-pyha770c72_0.tar.bz2#eeec8814bd97b2681f708bb127478d7d -https://conda.anaconda.org/t//conda-forge/osx-64/certifi-2021.10.8-py39h6e9494a_2.tar.bz2#087647d724d2a4c2859e203965f8f532 -https://conda.anaconda.org/t//conda-forge/osx-64/cffi-1.15.0-py39he338e87_0.tar.bz2#eb43d870c7a57ac917962fbb68455677 -https://conda.anaconda.org/t//conda-forge/osx-64/click-8.1.3-py39h6e9494a_0.tar.bz2#c33ff559cea7a89797decd6db3716b77 -https://conda.anaconda.org/t//conda-forge/noarch/clikit-0.6.2-pyh9f0ad1d_0.tar.bz2#159273f717a11e53b2656f8b6521a5e2 -https://conda.anaconda.org/t//conda-forge/osx-64/debugpy-1.6.0-py39hfd1d529_0.tar.bz2#43ec1c95677f4b111336cf091bdd064a -https://conda.anaconda.org/t//conda-forge/osx-64/docutils-0.16-py39h6e9494a_3.tar.bz2#c251f3d6defc0e68aef889b41e43df60 -https://conda.anaconda.org/t//conda-forge/osx-64/git-2.35.3-pl5321h33a4a8a_0.tar.bz2#f6e47a7e75c9b7fe6f14c97ee60918d6 -https://conda.anaconda.org/t//conda-forge/noarch/gitdb-4.0.9-pyhd8ed1ab_0.tar.bz2#40fc6b14a45dee3a3fd9f302d026108e -https://conda.anaconda.org/t//conda-forge/osx-64/greenlet-1.1.2-py39hfd1d529_2.tar.bz2#18ffa500d722e7d42705f5c75c003b80 -https://conda.anaconda.org/t//conda-forge/noarch/html5lib-1.1-pyh9f0ad1d_0.tar.bz2#b2355343d6315c892543200231d7154a -https://conda.anaconda.org/t//conda-forge/osx-64/importlib-metadata-4.11.3-py39h6e9494a_1.tar.bz2#9795d43694c096de21d356d286e34320 
-https://conda.anaconda.org/t//conda-forge/noarch/importlib_resources-5.7.1-pyhd8ed1ab_0.tar.bz2#8a50c32f48abec73bc3dd4df0d133892 -https://conda.anaconda.org/t//conda-forge/osx-64/jedi-0.18.1-py39h6e9494a_1.tar.bz2#17fd05dd01cc6e30715af4a5703cf0ef -https://conda.anaconda.org/t//conda-forge/osx-64/jupyter_core-4.9.2-py39h6e9494a_0.tar.bz2#cbeae523c59b27307332bd81c08c1a2b -https://conda.anaconda.org/t//conda-forge/noarch/latexcodec-2.0.1-pyh9f0ad1d_0.tar.bz2#8d67904973263afd2985ba56aa2d6bb4 -https://conda.anaconda.org/t//conda-forge/osx-64/libcblas-3.9.0-14_osx64_openblas.tar.bz2#09a2b714708e1c4320266fb878c54a3c -https://conda.anaconda.org/t//conda-forge/osx-64/liblapack-3.9.0-14_osx64_openblas.tar.bz2#154dd37ef8a9c421bee629d030550a30 -https://conda.anaconda.org/t//conda-forge/osx-64/libmambapy-0.23.0-py39h3f08081_1.tar.bz2#081304f648bd313d010a9db591470d48 -https://conda.anaconda.org/t//conda-forge/noarch/linkify-it-py-1.0.3-pyhd8ed1ab_0.tar.bz2#ba4b07f6a132c77eb69ede31a6ed790b -https://conda.anaconda.org/t//conda-forge/osx-64/lxml-4.8.0-py39h63b48b0_3.tar.bz2#76e4798438383445c8b6c2462d20812c -https://conda.anaconda.org/t//conda-forge/noarch/markdown-it-py-1.1.0-pyhd8ed1ab_0.tar.bz2#84e8dfb1a9e6a824f32fd45b867271ca -https://conda.anaconda.org/t//conda-forge/osx-64/markupsafe-2.1.1-py39h63b48b0_1.tar.bz2#478672fe1f0c0fafc4e99152cd7e33fe -https://conda.anaconda.org/t//conda-forge/noarch/matplotlib-inline-0.1.3-pyhd8ed1ab_0.tar.bz2#be3bfd435802d2c768c6b2439f325f3d -https://conda.anaconda.org/t//conda-forge/osx-64/mistune-0.8.4-py39h89e85a6_1005.tar.bz2#761ee1d4bd35121fc36ec1a2b065cb99 -https://conda.anaconda.org/t//conda-forge/osx-64/msgpack-python-1.0.3-py39h7248d28_1.tar.bz2#d1f902b64e344a541dd2ecb6db544175 -https://conda.anaconda.org/t//conda-forge/noarch/packaging-20.9-pyh44b312d_0.tar.bz2#be69a38e912054a62dc82cc3c7711a64 -https://conda.anaconda.org/t//conda-forge/noarch/pexpect-4.8.0-pyh9f0ad1d_2.tar.bz2#5909e7b978141dd80d28dbf9de627827 
-https://conda.anaconda.org/t//conda-forge/osx-64/poetry-core-1.0.8-py39h6e9494a_1.tar.bz2#d0e3d3795047ca328eeb399ea8118b56 -https://conda.anaconda.org/t//conda-forge/osx-64/psutil-5.9.0-py39h63b48b0_1.tar.bz2#7b458eead81bae468eb2f5c4a1d7c141 -https://conda.anaconda.org/t//conda-forge/osx-64/pycosat-0.6.3-py39h63b48b0_1010.tar.bz2#dfe087a3c51af3f3be906b8300fd03f2 -https://conda.anaconda.org/t//conda-forge/osx-64/pyrsistent-0.18.1-py39h63b48b0_1.tar.bz2#f30e91859ff641f638eeb49e1823c207 -https://conda.anaconda.org/t//conda-forge/osx-64/pysocks-1.7.1-py39h6e9494a_5.tar.bz2#e6845a71941ffc957c9e4ac0c4c88edd -https://conda.anaconda.org/t//conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2#dd999d1cc9f79e67dbb855c8924c7984 -https://conda.anaconda.org/t//conda-forge/osx-64/pyyaml-6.0-py39h63b48b0_4.tar.bz2#d7aa7f40b99d7bc9e06b20a613d6a7b0 -https://conda.anaconda.org/t//conda-forge/osx-64/pyzmq-22.3.0-py39hc2dc7ec_2.tar.bz2#bfa4a6b43ccbd64fee3157d70274ddeb -https://conda.anaconda.org/t//conda-forge/osx-64/ruamel.yaml.clib-0.2.6-py39h63b48b0_1.tar.bz2#9318cdefc8c0bb7514870afa9aa72e37 -https://conda.anaconda.org/t//conda-forge/osx-64/ruamel_yaml-0.15.80-py39h89e85a6_1006.tar.bz2#a5d158eaeef49d101b6bee913b7f07a3 -https://conda.anaconda.org/t//conda-forge/osx-64/setuptools-62.1.0-py39h6e9494a_0.tar.bz2#d16c2aa67a8f229126295d265e644e88 -https://conda.anaconda.org/t//conda-forge/osx-64/sniffio-1.2.0-py39h6e9494a_3.tar.bz2#ecdb9ed5ed5e8fa34dd7786387c6263a -https://conda.anaconda.org/t//conda-forge/noarch/tinycss2-1.1.1-pyhd8ed1ab_0.tar.bz2#5d280406501e79dc7aa9c9ac31d25a80 -https://conda.anaconda.org/t//conda-forge/noarch/tomlkit-0.10.2-pyha770c72_0.tar.bz2#482e5775f80665a7c9f76cd72a66eae8 -https://conda.anaconda.org/t//conda-forge/osx-64/tornado-6.1-py39h63b48b0_3.tar.bz2#82c1e73cdc3ae881ef28d56a3a58225c -https://conda.anaconda.org/t//conda-forge/noarch/tqdm-4.64.0-pyhd8ed1ab_0.tar.bz2#6642233f341e1900d0c8e6eddb979c14 
-https://conda.anaconda.org/t//conda-forge/noarch/typing-extensions-4.2.0-hd8ed1ab_1.tar.bz2#6d9d7480c5780514779967be2ee8b963 -https://conda.anaconda.org/t//conda-forge/osx-64/virtualenv-20.14.1-py39h6e9494a_0.tar.bz2#dc2e42e8457a7f53d98efbdded613328 -https://conda.anaconda.org/t//conda-forge/osx-64/anyio-3.5.0-py39h6e9494a_0.tar.bz2#b898a6cf2e081a521dd58285745b0762 -https://conda.anaconda.org/t//conda-forge/osx-64/argon2-cffi-bindings-21.2.0-py39h63b48b0_2.tar.bz2#8e0ec40d50d73eb7d63819c4b1939753 -https://conda.anaconda.org/t//conda-forge/noarch/backports.functools_lru_cache-1.6.4-pyhd8ed1ab_0.tar.bz2#c5b3edc62d6309088f4970b3eaaa65a6 -https://conda.anaconda.org/t//conda-forge/noarch/bleach-5.0.0-pyhd8ed1ab_0.tar.bz2#2a2ae7c56b8f72caba261363407b484a -https://conda.anaconda.org/t//conda-forge/osx-64/brotlipy-0.7.0-py39h63b48b0_1004.tar.bz2#7b42f41f7606f46a6b00ff4ac3c71b76 -https://conda.anaconda.org/t//conda-forge/noarch/cleo-0.8.1-pyhd8ed1ab_2.tar.bz2#4c82b11a3d06031bd58e7d869f53d965 -https://conda.anaconda.org/t//conda-forge/noarch/click-default-group-1.2.2-pyhd8ed1ab_1.tar.bz2#72a46ffc25701c173932fd55cf0965d3 -https://conda.anaconda.org/t//conda-forge/noarch/click-log-0.3.2-pyh9f0ad1d_0.tar.bz2#3a64d156136fad977df1b81a24b57ac0 -https://conda.anaconda.org/t//conda-forge/osx-64/conda-package-handling-1.8.1-py39h63b48b0_1.tar.bz2#09d59c368b74774aeb23183752bcb99f -https://conda.anaconda.org/t//conda-forge/osx-64/cryptography-36.0.2-py39h1644bb1_1.tar.bz2#aff910cb455e3885efc3e2e2070355b6 -https://conda.anaconda.org/t//conda-forge/noarch/ghp-import-2.1.0-pyhd8ed1ab_0.tar.bz2#6d8d61116031a3f5b1f32e7899785866 -https://conda.anaconda.org/t//conda-forge/noarch/gitpython-3.1.27-pyhd8ed1ab_0.tar.bz2#20acbaab17a50ac9b64138eb9a0e1af8 -https://conda.anaconda.org/t//conda-forge/noarch/importlib_metadata-4.11.3-hd8ed1ab_1.tar.bz2#bd6b6ae37c03e68061574d5e32fe5bd1 -https://conda.anaconda.org/t//conda-forge/noarch/jinja2-3.0.3-pyhd8ed1ab_0.tar.bz2#036d872c653780cb26e797e2e2f61b4c 
-https://conda.anaconda.org/t//conda-forge/noarch/joblib-1.1.0-pyhd8ed1ab_0.tar.bz2#07d1b5c8cde14d95998fd4767e1e62d2 -https://conda.anaconda.org/t//conda-forge/noarch/jsonschema-3.2.0-pyhd8ed1ab_3.tar.bz2#66125e28711d8ffc04a207a2b170316d -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_client-7.3.1-pyhd8ed1ab_0.tar.bz2#38481a37ead8c37d2ad7b52d3bc2b0a7 -https://conda.anaconda.org/t//conda-forge/noarch/mdit-py-plugins-0.2.8-pyhd8ed1ab_0.tar.bz2#49236fcd746a124eb56d326f79e1d46d -https://conda.anaconda.org/t//conda-forge/osx-64/numpy-1.22.3-py39h214027c_2.tar.bz2#1188d955cb2a6808aac16a3d6de99c8f -https://conda.anaconda.org/t//conda-forge/noarch/pip-22.0.4-pyhd8ed1ab_0.tar.bz2#b1239ce8ef2a1eec485c398a683c5bff -https://conda.anaconda.org/t//conda-forge/noarch/portpicker-1.5.0-pyhd8ed1ab_0.tar.bz2#5f9595b4b3f50a0d572b0a7c8b4293c7 -https://conda.anaconda.org/t//conda-forge/noarch/pybtex-0.24.0-pyhd8ed1ab_2.tar.bz2#2099b86a7399c44c0c61cdb6de6915ba -https://conda.anaconda.org/t//conda-forge/osx-64/pydantic-1.9.0-py39h63b48b0_1.tar.bz2#86b0524a36495228560db966b4db2e9d -https://conda.anaconda.org/t//conda-forge/noarch/pygments-2.12.0-pyhd8ed1ab_0.tar.bz2#cb27e2ded147e5bcc7eafc1c6d343cb3 -https://conda.anaconda.org/t//conda-forge/osx-64/ruamel.yaml-0.17.21-py39h63b48b0_1.tar.bz2#71bfd0949c4080c77609f70f230ec875 -https://conda.anaconda.org/t//conda-forge/osx-64/sqlalchemy-1.4.36-py39h701faf5_0.tar.bz2#5cfa799d0bf4fe4a71d78c177bccaadb -https://conda.anaconda.org/t//conda-forge/noarch/stack_data-0.2.0-pyhd8ed1ab_0.tar.bz2#8c0ce3e6bf18a0c810125aef58a2a6f3 -https://conda.anaconda.org/t//conda-forge/osx-64/terminado-0.13.3-py39h6e9494a_1.tar.bz2#ed379b461931b5a65c45d1d7d899f327 -https://conda.anaconda.org/t//conda-forge/osx-64/watchdog-2.1.7-py39h147bbb7_1.tar.bz2#b5219ca4f9ff36da6997c66db7a2945c -https://conda.anaconda.org/t//conda-forge/noarch/argon2-cffi-21.3.0-pyhd8ed1ab_0.tar.bz2#a0b402db58f73aaab8ee0ca1025a362e 
-https://conda.anaconda.org/t//conda-forge/osx-64/click-completion-0.5.2-py39h6e9494a_3.tar.bz2#9aa698e05da37cc193be450d56d974e9 -https://conda.anaconda.org/t//conda-forge/noarch/jupyterlab_pygments-0.2.2-pyhd8ed1ab_0.tar.bz2#243f63592c8e449f40cd42eb5cf32f40 -https://conda.anaconda.org/t//conda-forge/osx-64/keyring-23.4.0-py39h6e9494a_2.tar.bz2#ffd5711dc153e8682bdb13a104544516 -https://conda.anaconda.org/t//conda-forge/noarch/nbformat-5.4.0-pyhd8ed1ab_0.tar.bz2#770f6659243e2c79a0b8488b0e463bd1 -https://conda.anaconda.org/t//conda-forge/osx-64/pandas-1.4.2-py39hbd61c47_1.tar.bz2#774ebb7e6c43776b8927eb043bb46577 -https://conda.anaconda.org/t//conda-forge/osx-64/pybtex-docutils-1.0.1-py39h6e9494a_1.tar.bz2#f000ab7da2a0177808ebc7225b4490e5 -https://conda.anaconda.org/t//conda-forge/noarch/pyopenssl-22.0.0-pyhd8ed1ab_0.tar.bz2#1d7e241dfaf5475e893d4b824bb71b44 -https://conda.anaconda.org/t//conda-forge/osx-64/scipy-1.8.0-py39h056f1c0_1.tar.bz2#1f27c3576dc5d94c2e089bd2a9e47f16 -https://conda.anaconda.org/t//conda-forge/noarch/wcwidth-0.2.5-pyh9f0ad1d_2.tar.bz2#5266fcd697043c59621fda522b3d78ee -https://conda.anaconda.org/t//conda-forge/noarch/altair-4.2.0-pyhd8ed1ab_1.tar.bz2#2867acfe48ceb3630b163632914720d9 -https://conda.anaconda.org/t//conda-forge/noarch/jupytext-1.13.8-pyh4b9bcc7_0.tar.bz2#aba00353637aa69640ec2ae150dc592d -https://conda.anaconda.org/t//conda-forge/noarch/nbclient-0.5.13-pyhd8ed1ab_0.tar.bz2#3edde88a191701cf052216c4ba353a83 -https://conda.anaconda.org/t//conda-forge/noarch/prompt-toolkit-3.0.29-pyha770c72_0.tar.bz2#9e720b57b22ef3032b4fb081697819dd -https://conda.anaconda.org/t//conda-forge/osx-64/scikit-learn-1.0.2-py39hd4eea88_0.tar.bz2#bffc17fe77541f4c7bf0ae766b69ea36 -https://conda.anaconda.org/t//conda-forge/noarch/urllib3-1.26.9-pyhd8ed1ab_0.tar.bz2#0ea179ee251aa7100807c35bc0252693 -https://conda.anaconda.org/t//conda-forge/noarch/altair_data_server-0.4.1-py_0.tar.bz2#5412ec3d2792d3e2f18e075cb05ffdaf 
-https://conda.anaconda.org/t//conda-forge/osx-64/ipython-8.3.0-py39h6e9494a_0.tar.bz2#3d1dfb144dae1eba434cade94fb5f1e5 -https://conda.anaconda.org/t//conda-forge/noarch/nbconvert-core-6.5.0-pyhd8ed1ab_0.tar.bz2#42f74c4b38a099025167e76a7437edf1 -https://conda.anaconda.org/t//conda-forge/noarch/requests-2.27.1-pyhd8ed1ab_0.tar.bz2#7c1c427246b057b8fa97200ecdb2ed62 -https://conda.anaconda.org/t//conda-forge/noarch/cachecontrol-0.12.11-pyhd8ed1ab_0.tar.bz2#6eefee9888f33f150b5d44d616b1a613 -https://conda.anaconda.org/t//conda-forge/osx-64/conda-4.12.0-py39h6e9494a_0.tar.bz2#8aa039a63e9d764ea3758530b4388066 -https://conda.anaconda.org/t//conda-forge/noarch/ensureconda-1.4.2-pyhd8ed1ab_0.tar.bz2#1bf97b25d058482fd73e62b1cdb932ef -https://conda.anaconda.org/t//conda-forge/osx-64/ipykernel-6.13.0-py39h71a6800_0.tar.bz2#79211930da8afc1c75f8033711f3b60d -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_server-1.17.0-pyhd8ed1ab_0.tar.bz2#276b3b45443e2a84cfb4d128cb86c350 -https://conda.anaconda.org/t//conda-forge/noarch/nbconvert-pandoc-6.5.0-pyhd8ed1ab_0.tar.bz2#d7421adfc67100021d87032447066129 -https://conda.anaconda.org/t//conda-forge/noarch/pooch-1.6.0-pyhd8ed1ab_0.tar.bz2#6429e1d1091c51f626b5dcfdd38bf429 -https://conda.anaconda.org/t//conda-forge/noarch/requests-toolbelt-0.9.1-py_0.tar.bz2#402668adee8fcba9a9c265cdc2a88f5a -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-4.5.0-pyh6c4a22f_0.tar.bz2#46b38d88c4270ff9ba78a89c83c66345 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter-server-mathjax-0.2.5-pyhc268e32_0.tar.bz2#0393370c2dec5e92e1727a8650f908f7 -https://conda.anaconda.org/t//conda-forge/noarch/jupyterlab_server-2.13.0-pyhd8ed1ab_1.tar.bz2#ecead930bfd8c0f629c5b8bf5c1e3508 -https://conda.anaconda.org/t//conda-forge/osx-64/mamba-0.23.0-py39ha435c47_1.tar.bz2#e9f0f616c303e9bc41ef43f509a167f2 -https://conda.anaconda.org/t//conda-forge/noarch/myst-parser-0.15.2-pyhd8ed1ab_0.tar.bz2#0c2976e0a1af80ce224388da557eeece 
-https://conda.anaconda.org/t//conda-forge/noarch/nbconvert-6.5.0-pyhd8ed1ab_0.tar.bz2#156c180588e38b9f41758058824ec50f -https://conda.anaconda.org/t//conda-forge/noarch/notebook-shim-0.1.0-pyhd8ed1ab_0.tar.bz2#3a8e2c7dcc674f2cb0784f1faba57055 -https://conda.anaconda.org/t//conda-forge/osx-64/poetry-1.1.13-py39h6e9494a_1.tar.bz2#32d8696f9943cca10d86a74181407615 -https://conda.anaconda.org/t//conda-forge/noarch/pydata-sphinx-theme-0.7.2-pyhd8ed1ab_0.tar.bz2#123f5c52d6b4117225af45a52ec34997 -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-comments-0.0.3-pyh9f0ad1d_0.tar.bz2#2ae3ce35de0c1cec45c94182694f8d1b -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-copybutton-0.5.0-pyhd8ed1ab_0.tar.bz2#4c969cdd5191306c269490f7ff236d9c -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-external-toc-0.2.4-pyhd8ed1ab_0.tar.bz2#91ae8770569b73f25e1127526c1329ed -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-jupyterbook-latex-0.4.6-pyhd8ed1ab_0.tar.bz2#6e4a69a0c8adbb48178fdf3efa24fa4c -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-multitoc-numbering-0.1.3-pyhd8ed1ab_0.tar.bz2#40749a4d0f0d2e11c65fb26c1cd16a90 -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-panels-0.6.0-pyhd8ed1ab_0.tar.bz2#6eec6480601f5d15babf9c3b3987f34a -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-thebe-0.1.2-pyhd8ed1ab_0.tar.bz2#1d4fdd342aa955085a0f21e26bb585f7 -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-togglebutton-0.3.1-pyhd8ed1ab_0.tar.bz2#71418887aa6599ea2935f4958e5e1d15 -https://conda.anaconda.org/t//conda-forge/noarch/sphinxcontrib-bibtex-2.4.2-pyhd8ed1ab_0.tar.bz2#d826ac2b3edfe7a8113596c2023f092b -https://conda.anaconda.org/t//conda-forge/noarch/conda-lock-1.0.5-pyhd8ed1ab_0.tar.bz2#7544764bbf4941cc954bb80911f3b201 -https://conda.anaconda.org/t//conda-forge/noarch/nbdime-3.1.1-pyhd8ed1ab_0.tar.bz2#38dc061ffabe665b79f4c7c52cefa809 
-https://conda.anaconda.org/t//conda-forge/noarch/notebook-6.4.11-pyha770c72_0.tar.bz2#da25720a88aa3cbb3e16df740783da74 -https://conda.anaconda.org/t//conda-forge/noarch/sphinx-book-theme-0.1.10-pyhd8ed1ab_1.tar.bz2#194ec0159031da65b653c665bb1678f6 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter-cache-0.4.3-pyhd8ed1ab_0.tar.bz2#03cd9218c96d513854bfc8714eaf9451 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_contrib_core-0.3.3-py_2.tar.bz2#4704781b0a914d67e4ae8e9f9f5c37a0 -https://conda.anaconda.org/t//conda-forge/noarch/nbclassic-0.3.7-pyhd8ed1ab_0.tar.bz2#a8a7139140a7512c90514444444a4991 -https://conda.anaconda.org/t//conda-forge/osx-64/widgetsnbextension-3.6.0-py39h6e9494a_0.tar.bz2#2b05800adc9c5d71f8c792167bf0556d -https://conda.anaconda.org/t//conda-forge/noarch/ipywidgets-7.7.0-pyhd8ed1ab_0.tar.bz2#a3d2ccd3d9f9fcb65765c22f500529b4 -https://conda.anaconda.org/t//conda-forge/osx-64/jupyter_highlight_selected_word-0.2.0-py39h6e9494a_1005.tar.bz2#c054e75a5b8524199cfb8f194acf5832 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_latex_envs-1.4.6-pyhd8ed1ab_1002.tar.bz2#4b888fd7d6b4cdb6736878b2cf8ea951 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_nbextensions_configurator-0.4.1-pyhd8ed1ab_2.tar.bz2#19a2fa481008976df3ed8ce5d4dfb8fa -https://conda.anaconda.org/t//conda-forge/noarch/jupyterlab-3.4.0-pyhd8ed1ab_0.tar.bz2#79d9efa21a8dbe3f9fdbe8069a61ac26 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter-sphinx-0.3.2-pyhd8ed1ab_1.tar.bz2#a47ca0f91417e5d29d075ca416254466 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter_contrib_nbextensions-0.5.1-pyhd8ed1ab_2.tar.bz2#ee3820cb73867efb8c928b93e55f4de3 -https://conda.anaconda.org/t//conda-forge/noarch/myst-nb-0.13.2-pyhd8ed1ab_0.tar.bz2#800e968e63eb593c651a14907fd82d26 -https://conda.anaconda.org/t//conda-forge/noarch/jupyter-book-0.12.3-pyhd8ed1ab_0.tar.bz2#1d6c5efa323c2cfd6196af5524aa5b78 diff --git a/unused/install/environment.yml b/unused/install/environment.yml 
deleted file mode 100644 index c05e6868..00000000 --- a/unused/install/environment.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: dsci100 -channels: - - eoas_ubc - - conda-forge -dependencies: - - altair - - altair_data_server - - click - - conda-lock - - ghp-import - - git - - jupyterlab - - jupytext - - jupyter_contrib_nbextensions - - mamba - - nodejs - - notebook - - numpy - - pandas - - pip - - python<3.10 - - jinja2==3.0.3 - - scikit-learn - - openpyxl -# conda-lock --kind explicit --file environment.yml -p linux-64 diff --git a/unused/install/requirements.txt b/unused/install/requirements.txt deleted file mode 100644 index 56cee422..00000000 --- a/unused/install/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -git+https://github.com/eoas-ubc/jb_tools.git@master diff --git a/unused/notes/Readme_githubio.md b/unused/notes/Readme_githubio.md deleted file mode 100644 index a267c4d0..00000000 --- a/unused/notes/Readme_githubio.md +++ /dev/null @@ -1,61 +0,0 @@ -# building an html book for github.io - -## Setup - -1. Fork https://github.com/UBC-DSCI/introduction-to-datascience-python -2. Upload a ssh public key to your github account -3. Edit your .ssh/config to allow ssh read/write to github with your user id -- using a nickname like phaustin for your github host entry: - -```bash - Host phaustin - HostName github.com - User git - IdentityFile ~/.ssh/new_pha_git - IdentitiesOnly yes -``` - -4. Clone your fork using ssh: - -```bash - git clone phaustin:phaustin/introduction-to-datascience-python -``` - -5. Add the upstream remote: - -```bash - git remote add upstream phaustin:UBC-DSCI/introduction-to-datascience-python -``` - -6. Fetch the upstream branches - -```bash - git fetch upstream -``` - -6. Create the topic branch: - -```bash - git checkout -b classification1 origin/classification1 -``` - -7. If necessary (i.e. if someone has pushed new changes to main), rebase origin on upstream - -```bash - git rebase upstream/classification1 -``` - -8. 
Build the book and push the html to gh-pages - -```bash - jb build source - ./push_html.sh -``` - -9. In the setting for your forked repo, turn on github pages and set it to source the gh-pages branch - -10. Point your browser to the fork's github.io address: - - https://phaustin.github.io/introduction-to-datascience-python/classification1.html# - - -** note that it might take a minute or two for github to overwrite the old html on their server. There can also be browser cache issues. I use the "clear cache" extension on chrome and also do a hard refresh: see https://fabricdigital.co.nz/blog/how-to-hard-refresh-your-browser-and-clear-cache. I also sometimes write a version number in the top header so I know that I'm looking at the current version. You can also use an incognito window to be sure you've got a fresh session. diff --git a/unused/notes/Readme_install.md b/unused/notes/Readme_install.md deleted file mode 100644 index 78f92680..00000000 --- a/unused/notes/Readme_install.md +++ /dev/null @@ -1,72 +0,0 @@ -# building the book - -## fork or clone the repository - -* git clone https://github.com/UBC-DSCI/introduction-to-datascience-python.git - cd introduction-to-datascience-python - git checkout -b myst - -## installing the build environment - -1) Download miniforge from https://github.com/conda-forge/miniforge/releases - -2) install and activate the base environment in a shell - -3) do: - - cd install - mamba env create --name dsci --file environment.yml - conda activate dsci - pip install -r requirements.txt - npm install -g live-server - -This should give you an environment with jupyter-book, scikit-learn, and a live reload server which can automatically build the book following https://github.com/eoas-ubc/jb_tools/blob/master/tools_demo/Readme_conda.md - -4) If you'd like to produce and use a lock file, do: - - conda-lock --kind explicit --file environment.yml -p ostype - - where ostype is one of `linux-64`, `win-64` or `osx-64`. 
Then use the lockfile with: - - mamba activate base - mamba install --name dsci --file conda-lock-ostype - -## building the book - -5) do: - - cd source - jb build . - -6) view the html in `./_build/html` - -## publishing the book - -7) do: - - cd introduction-to-datascience-python - ./push_html.sh - - this will update the github.io version at: https://phaustin.github.io/introduction-to-datascience-python/intro.html - - -## working with livereload - -To have the book built locally when you change a file: - -8) in one terminal, start a source file watcher: - - cd introduction-to-datascience-python - ebp-watch jb source - -9) in another terminal, use live-server to open a local browser tab with yoru book - - live-server source/_build/html/ - -After you do this, changing any file in the source folder should trigger a jb build and browser refresh. - - - - - - diff --git a/unused/notes/Readme_overview.md b/unused/notes/Readme_overview.md deleted file mode 100644 index 944b668c..00000000 --- a/unused/notes/Readme_overview.md +++ /dev/null @@ -1,40 +0,0 @@ -# OCESE summer notes - -## resource links - -### DSCI 100 - -* the instructor repo (ubc enterprise github) https://github.ubc.ca/dsci-100-instructor -* the student repo) https://github.com/ubc-dsci/dsci-100-student - -### python textbook repo and forks - -* https://github.com/ubc-dsci/introduction-to-datascience-python -* https://github.com/lheagy/introduction-to-datascience-python -* https://github.com/phaustin/introduction-to-datascience-python - -#### textbook branches - -* dev: https://github.com/ubc-dsci/introduction-to-datascience-python/tree/dev -* chapter 2 (reading,navya): https://github.com/ubc-dsci/introduction-to-datascience-python/tree/reading - * tutorial: https://github.ubc.ca/UBC-DSCI/dsci-100-instructor/tree/py_tutorial_reading - * worksheet: https://github.ubc.ca/UBC-DSCI/dsci-100-instructor/tree/py_worksheet_reading - * issue: https://github.com/UBC-DSCI/introduction-to-datascience-python/issues/4 -* 
chapter 5 (classification1, gloria): https://github.com/ubc-dsci/introduction-to-datascience-python/tree/classification1 - * tutorial: https://github.ubc.ca/UBC-DSCI/dsci-100-instructor/tree/py_tutorial_classification1 - * worksheet: https://github.ubc.ca/UBC-DSCI/dsci-100-instructor/tree/py_worksheet_classification1 - * issue: https://github.com/UBC-DSCI/introduction-to-datascience-python/issues/3 - - -### R textbook - -* the R textbook repo: https://github.com/ubc-dsci/introduction-to-datascience -* rendered textbook (R version): https://datasciencebook.ca/ - - -### nbgrader - -* rudaux: https://github.com/UBC-DSCI/rudaux -* autotest (private): https://github.com/ubc-dsci/autotest - - diff --git a/unused/notes/figure-caption.ipynb b/unused/notes/figure-caption.ipynb deleted file mode 100644 index b6d0b2f6..00000000 --- a/unused/notes/figure-caption.ipynb +++ /dev/null @@ -1,252 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test notebook for figure captions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ":::{figure-md} figception\n", - "\"figs\"\n", - "\n", - "A fig of a fig.\n", - ":::" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Testing a ref to a fig here {ref}`figception` and a numbered fig here {numref}`figception`." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from matplotlib import rcParams, cycler\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "plt.ion()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
    " - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Fixing random state for reproducibility\n", - "np.random.seed(19680801)\n", - "\n", - "N = 10\n", - "data = [np.logspace(0, 1, 100) + np.random.randn(100) + ii for ii in range(N)]\n", - "data = np.array(data).T\n", - "cmap = plt.cm.coolwarm\n", - "rcParams['axes.prop_cycle'] = cycler(color=cmap(np.linspace(0, 1, N)))\n", - "\n", - "\n", - "from matplotlib.lines import Line2D\n", - "custom_lines = [Line2D([0], [0], color=cmap(0.), lw=4),\n", - " Line2D([0], [0], color=cmap(.5), lw=4),\n", - " Line2D([0], [0], color=cmap(1.), lw=4)]\n", - "\n", - "fig, ax = plt.subplots(figsize=(10, 5))\n", - "lines = ax.plot(data)\n", - "ax.legend(custom_lines, ['Cold', 'Medium', 'Hot']);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```{figure} \n", - ":name: mpl\n", - ":figclass: caption-hack\n", - "\n", - "Testing an mpl plot\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ref to the mpl fig here {ref}`mpl` and {numref}`mpl`." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
    \n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import altair as alt\n", - "from vega_datasets import data\n", - "\n", - "source = data.cars()\n", - "\n", - "alt.Chart(source).mark_circle(size=60).encode(\n", - " x='Horsepower',\n", - " y='Miles_per_Gallon',\n", - " color='Origin',\n", - " tooltip=['Name', 'Origin', 'Horsepower', 'Miles_per_Gallon']\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```{figure} \n", - ":name: altair\n", - ":figclass: caption-hack\n", - "\n", - "Testing an Altair plot.\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ref to the mpl fig here {ref}`altair` and {numref}`altair`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "celltoolbar": "Edit Metadata", - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}