diff --git a/.gitignore b/.gitignore index d951f3fb9cbad..874ec9a6d1113 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,10 @@ doc/source/savefig/ # Pyodide/WASM related files # ############################## /.pyodide-xbuildenv-* + +# Web # +####### +web/pandas/content/en/pdeps +web/pandas/content/en/static +web/pandas/content/pt/pdeps +web/pandas/content/pt/static diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index c26b093b0c4ba..82ca9c4c589b9 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -1,5 +1,5 @@ - + pandas - Python Data Analysis Library @@ -15,6 +15,8 @@ href="{{ base_url }}{{ stylesheet }}"> {% endfor %} + +
@@ -50,6 +52,8 @@ {% endif %} {% endfor %} + +
diff --git a/web/pandas/about/citing.md b/web/pandas/content/en/about/citing.md similarity index 100% rename from web/pandas/about/citing.md rename to web/pandas/content/en/about/citing.md diff --git a/web/pandas/about/governance.md b/web/pandas/content/en/about/governance.md similarity index 100% rename from web/pandas/about/governance.md rename to web/pandas/content/en/about/governance.md diff --git a/web/pandas/about/index.md b/web/pandas/content/en/about/index.md similarity index 100% rename from web/pandas/about/index.md rename to web/pandas/content/en/about/index.md diff --git a/web/pandas/about/roadmap.md b/web/pandas/content/en/about/roadmap.md similarity index 100% rename from web/pandas/about/roadmap.md rename to web/pandas/content/en/about/roadmap.md diff --git a/web/pandas/about/sponsors.md b/web/pandas/content/en/about/sponsors.md similarity index 100% rename from web/pandas/about/sponsors.md rename to web/pandas/content/en/about/sponsors.md diff --git a/web/pandas/about/team.md b/web/pandas/content/en/about/team.md similarity index 100% rename from web/pandas/about/team.md rename to web/pandas/content/en/about/team.md diff --git a/web/pandas/community/benchmarks.md b/web/pandas/content/en/community/benchmarks.md similarity index 100% rename from web/pandas/community/benchmarks.md rename to web/pandas/content/en/community/benchmarks.md diff --git a/web/pandas/community/blog/2019-user-survey.md b/web/pandas/content/en/community/blog/2019-user-survey.md similarity index 100% rename from web/pandas/community/blog/2019-user-survey.md rename to web/pandas/content/en/community/blog/2019-user-survey.md diff --git a/web/pandas/community/blog/asv-pandas-grant.md b/web/pandas/content/en/community/blog/asv-pandas-grant.md similarity index 100% rename from web/pandas/community/blog/asv-pandas-grant.md rename to web/pandas/content/en/community/blog/asv-pandas-grant.md diff --git a/web/pandas/community/blog/extension-arrays.md 
b/web/pandas/content/en/community/blog/extension-arrays.md similarity index 100% rename from web/pandas/community/blog/extension-arrays.md rename to web/pandas/content/en/community/blog/extension-arrays.md diff --git a/web/pandas/community/blog/index.html b/web/pandas/content/en/community/blog/index.html similarity index 100% rename from web/pandas/community/blog/index.html rename to web/pandas/content/en/community/blog/index.html diff --git a/web/pandas/community/blog/pandas-1.0.md b/web/pandas/content/en/community/blog/pandas-1.0.md similarity index 100% rename from web/pandas/community/blog/pandas-1.0.md rename to web/pandas/content/en/community/blog/pandas-1.0.md diff --git a/web/pandas/community/coc.md b/web/pandas/content/en/community/coc.md similarity index 100% rename from web/pandas/community/coc.md rename to web/pandas/content/en/community/coc.md diff --git a/web/pandas/community/ecosystem.md b/web/pandas/content/en/community/ecosystem.md similarity index 100% rename from web/pandas/community/ecosystem.md rename to web/pandas/content/en/community/ecosystem.md diff --git a/web/pandas/config.yml b/web/pandas/content/en/config.yml similarity index 99% rename from web/pandas/config.yml rename to web/pandas/content/en/config.yml index 679778330b68d..e544b491bb1e9 100644 --- a/web/pandas/config.yml +++ b/web/pandas/content/en/config.yml @@ -1,4 +1,5 @@ main: + base_url: "en" templates_path: _templates base_template: "layout.html" production_url: "https://pandas.pydata.org/" @@ -58,6 +59,7 @@ navbar: target: community/benchmarks.html - name: "Contribute" target: contribute.html + blog: num_posts: 50 posts_path: community/blog diff --git a/web/pandas/contribute.md b/web/pandas/content/en/contribute.md similarity index 100% rename from web/pandas/contribute.md rename to web/pandas/content/en/contribute.md diff --git a/web/pandas/content/en/donate.md b/web/pandas/content/en/donate.md new file mode 100644 index 0000000000000..69db7e4648e77 --- /dev/null +++ 
b/web/pandas/content/en/donate.md @@ -0,0 +1,14 @@ +# Donate to pandas + +
+
+ + +_pandas_ is a Sponsored Project of [NumFOCUS](https://numfocus.org/), a 501(c)(3) nonprofit charity in the United States. +NumFOCUS provides _pandas_ with fiscal, legal, and administrative support to help ensure the +health and sustainability of the project. Visit numfocus.org for more information. + +Donations to _pandas_ are managed by NumFOCUS. For donors in the United States, your gift is tax-deductible +to the extent provided by law. As with any donation, you should consult with your tax adviser about your particular tax situation. diff --git a/web/pandas/getting_started.md b/web/pandas/content/en/getting_started.md similarity index 100% rename from web/pandas/getting_started.md rename to web/pandas/content/en/getting_started.md diff --git a/web/pandas/index.html b/web/pandas/content/en/index.html similarity index 100% rename from web/pandas/index.html rename to web/pandas/content/en/index.html diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/content/pdeps/0001-purpose-and-guidelines.md similarity index 100% rename from web/pandas/pdeps/0001-purpose-and-guidelines.md rename to web/pandas/content/pdeps/0001-purpose-and-guidelines.md diff --git a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md b/web/pandas/content/pdeps/0004-consistent-to-datetime-parsing.md similarity index 100% rename from web/pandas/pdeps/0004-consistent-to-datetime-parsing.md rename to web/pandas/content/pdeps/0004-consistent-to-datetime-parsing.md diff --git a/web/pandas/pdeps/0005-no-default-index-mode.md b/web/pandas/content/pdeps/0005-no-default-index-mode.md similarity index 100% rename from web/pandas/pdeps/0005-no-default-index-mode.md rename to web/pandas/content/pdeps/0005-no-default-index-mode.md diff --git a/web/pandas/pdeps/0006-ban-upcasting.md b/web/pandas/content/pdeps/0006-ban-upcasting.md similarity index 100% rename from web/pandas/pdeps/0006-ban-upcasting.md rename to web/pandas/content/pdeps/0006-ban-upcasting.md diff --git 
a/web/pandas/pdeps/0007-copy-on-write.md b/web/pandas/content/pdeps/0007-copy-on-write.md similarity index 100% rename from web/pandas/pdeps/0007-copy-on-write.md rename to web/pandas/content/pdeps/0007-copy-on-write.md diff --git a/web/pandas/pdeps/0009-io-extensions.md b/web/pandas/content/pdeps/0009-io-extensions.md similarity index 100% rename from web/pandas/pdeps/0009-io-extensions.md rename to web/pandas/content/pdeps/0009-io-extensions.md diff --git a/web/pandas/pdeps/0010-required-pyarrow-dependency.md b/web/pandas/content/pdeps/0010-required-pyarrow-dependency.md similarity index 100% rename from web/pandas/pdeps/0010-required-pyarrow-dependency.md rename to web/pandas/content/pdeps/0010-required-pyarrow-dependency.md diff --git a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md b/web/pandas/content/pdeps/0012-compact-and-reversible-JSON-interface.md similarity index 100% rename from web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md rename to web/pandas/content/pdeps/0012-compact-and-reversible-JSON-interface.md diff --git a/web/pandas/pdeps/0014-string-dtype.md b/web/pandas/content/pdeps/0014-string-dtype.md similarity index 100% rename from web/pandas/pdeps/0014-string-dtype.md rename to web/pandas/content/pdeps/0014-string-dtype.md diff --git a/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md b/web/pandas/content/pdeps/0017-backwards-compatibility-and-deprecation-policy.md similarity index 100% rename from web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md rename to web/pandas/content/pdeps/0017-backwards-compatibility-and-deprecation-policy.md diff --git a/web/pandas/content/pt/about/citing.md b/web/pandas/content/pt/about/citing.md new file mode 100644 index 0000000000000..a3c470d05e55f --- /dev/null +++ b/web/pandas/content/pt/about/citing.md @@ -0,0 +1,127 @@ +# Citing and logo + +## Citing pandas + +If you use _pandas_ for a scientific publication, we would appreciate citations 
to the published software and the +following paper: + +- [pandas on Zenodo](https://zenodo.org/search?page=1&size=20&q=conceptrecid%3A%223509134%22&sort=-version&all_versions=True), + Please find us on Zenodo and replace with the citation for the version you are using. You can replace the full author + list from there with "The pandas development team" like in the example below. + + @software{reback2020pandas, + author = {The pandas development team}, + title = {pandas-dev/pandas: Pandas}, + month = feb, + year = 2020, + publisher = {Zenodo}, + version = {latest}, + doi = {10.5281/zenodo.3509134}, + url = {https://doi.org/10.5281/zenodo.3509134} + } + +- [Data structures for statistical computing in python](https://pub.curvenote.com/01908378-3686-7168-a380-d82bbf21c799/public/mckinney-57fc0d4e8a08cd7f26a4b8bf468a71f4.pdf), + McKinney, Proceedings of the 9th Python in Science Conference, Volume 445, 2010. + + @InProceedings{ mckinney-proc-scipy-2010, + author = { {W}es {M}c{K}inney }, + title = { {D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython }, + booktitle = { {P}roceedings of the 9th {P}ython in {S}cience {C}onference }, + pages = { 56 - 61 }, + year = { 2010 }, + editor = { {S}t\'efan van der {W}alt and {J}arrod {M}illman }, + doi = { 10.25080/Majora-92bf1922-00a } + } + +## Brand and logo + +When using the project name _pandas_, please use it in lower case, even at the beginning of a sentence. + +The official logos of _pandas_ are: + +### Primary logo + + + + + + + + +### Secondary logo + + + + + + + + +### Logo mark + + + + + + + + +### Logo usage + +The pandas logo is available in full color and white accent. +The full color logo should only appear against white backgrounds. +The white accent logo should go against contrasting color background. 
+ +When using the logo, please follow the next directives: + +- Primary logo should never be seen under 1 inch in size for printing and 72px for web +- The secondary logo should never be seen under 0.75 inch in size for printing and 55px for web +- Leave enough margin around the logo (leave the height of the logo in the top, bottom and both sides) +- Do not distort the logo by changing its proportions +- Do not place text or other elements on top of the logo + +### Colors + + + + + + + +
+ + + +
+ Blue
+ RGB: R21 G4 B88
+ HEX: #150458 +
+ + + +
+ Yellow
+ RGB: R255 G202 B0
+ HEX: #FFCA00 +
+ + + +
+ Pink
+ RGB: R231 G4 B136
+ HEX: #E70488 +
diff --git a/web/pandas/content/pt/about/governance.md b/web/pandas/content/pt/about/governance.md new file mode 100644 index 0000000000000..d8777d1d0c15d --- /dev/null +++ b/web/pandas/content/pt/about/governance.md @@ -0,0 +1,317 @@ +# Project governance + +The official version of this document, along with a list of +individuals and institutions in the roles defined in the governance +section below, is contained in the +[Project governance]({{ base_url }}about/governance.html) +page of the pandas website. + +## The Project + +The pandas Project (The Project) is an open source software project affiliated +with the 501(c)3 NumFOCUS Foundation. The goal of The Project is to develop open +source software for data ingest, data preparation, data analysis, and data +visualization for the Python programming language. The Software developed by +The Project is released under the BSD (or similar) open source license, +developed openly and hosted in public GitHub repositories under the [pandas +GitHub organization](https://github.com/pandas-dev). Examples of Project Software +include the main pandas code repository and the pandas-stubs library. + +Through its affiliation with NumFOCUS, The Project has the right to receive +tax-deductible donations in the United States of America. + +The Project is developed by a team of distributed developers, called +Contributors. Contributors are individuals who have contributed code, +documentation, designs or other work to one or more Project repositories. +Anyone can be a Contributor. Contributors can be affiliated with any legal +entity or none. Contributors participate in the project by submitting, +reviewing and discussing GitHub Pull Requests and Issues and participating in +open and public Project discussions on GitHub, mailing lists, and +elsewhere. The foundation of Project participation is openness and +transparency. 
+ +Here is a list of the current Contributors to the main pandas repository: + +[https://github.com/pandas-dev/pandas/graphs/contributors](https://github.com/pandas-dev/pandas/graphs/contributors) + +There are also many other Contributors listed in the logs of other repositories of +the pandas project. + +The Project Community consists of all Contributors and Users of the Project. +Contributors work on behalf of and are responsible to the larger Project +Community and we strive to keep the barrier between Contributors and Users as +low as possible. + +The Project is formally affiliated with the 501(c)3 NumFOCUS Foundation +([https://numfocus.org](https://numfocus.org)), which serves as its fiscal +sponsor, may hold project trademarks and other intellectual property, helps +manage project donations and acts as a parent legal entity. NumFOCUS is the +only legal entity that has a formal relationship with the project (see +Institutional Partners section below). + +## Governance + +This section describes the governance and leadership model of The Project. + +The foundations of Project governance are: + +- Openness & Transparency +- Active Contribution +- Institutional Neutrality + +Traditionally, Project leadership was provided by a BDFL (Wes McKinney) and +subset of Contributors, called the Core Team, whose active and consistent +contributions have been recognized by their receiving “commit rights” to the +Project GitHub repositories. In general all Project decisions are made through +consensus among the Core Team with input from the Community. The BDFL can, but +rarely chooses to, override the Core Team and make a final decision on a +matter. + +While this approach has served us well, as the Project grows and faces more +legal and financial decisions and interacts with other institutions, we see a +need for a more formal governance model. Moving forward The Project leadership +will consist of a BDFL and Core Team. 
We view this governance model as the +formalization of what we are already doing, rather than a change in direction. + +### BDFL + +The Project will have a BDFL (Benevolent Dictator for Life), who is currently +Wes McKinney. As Dictator, the BDFL has the authority to make all final +decisions for The Project. As Benevolent, the BDFL, in practice chooses to +defer that authority to the consensus of the community discussion channels and +the Core Team. It is expected, and in the past has been the case, that the BDFL +will only rarely assert his/her final authority. Because it is rarely used, we +refer to BDFL’s final authority as a “special” or “overriding” vote. When it +does occur, the BDFL override typically happens in situations where there is a +deadlock in the Core Team or if the Core Team ask the BDFL to make a decision +on a specific matter. To ensure the benevolence of the BDFL, The Project +encourages others to fork the project if they disagree with the overall +direction the BDFL is taking. The BDFL is chair of the Core Team (see below) +and may delegate his/her authority on a particular decision or set of decisions +to any other Core Team Member at his/her discretion. + +The BDFL can appoint his/her successor, but it is expected that the Core Team +would be consulted on this decision. If the BDFL is unable to appoint a +successor (e.g. due to death or illness), the Core Team will choose a successor +by voting with at least 2/3 of the Core Team members voting in favor of the +chosen successor. At least 80% of the Core Team must participate in the +vote. If no BDFL candidate receives 2/3 of the votes of the Core Team, the Core +Team members shall propose the BDFL candidates to the Main NumFOCUS board, who +will then make the final decision. + +### Core Team + +The Project's Core Team will consist of Project Contributors who have produced +contributions that are substantial in quality and quantity, and sustained over +at least one year. 
The overall role of the Core Team is to ensure, through +working with the BDFL and taking input from the Community, the long-term +well-being of the project, both technically and as a community. + +During the everyday project activities, Core Team participate in all +discussions, code review and other project activities as peers with all other +Contributors and the Community. In these everyday activities, Core Team do not +have any special power or privilege through their membership on the Core +Team. However, it is expected that because of the quality and quantity of their +contributions and their expert knowledge of the Project Software that the Core +Team will provide useful guidance, both technical and in terms of project +direction, to potentially less experienced contributors. + +The Core Team and its Members play a special role in certain situations. +In particular, the Core Team may: + +- Make decisions about the overall scope, vision and direction of the + project. +- Make decisions about strategic collaborations with other organizations or + individuals. +- Make decisions about specific technical issues, features, bugs and pull + requests. They are the primary mechanism of guiding the code review process + and merging pull requests. +- Make decisions about the Services that are run by The Project and manage + those Services for the benefit of the Project and Community. +- Make decisions when regular community discussion doesn't produce consensus + on an issue in a reasonable time frame. + +### Core Team membership + +To become eligible for being a Core Team Member an individual must be a Project +Contributor who has produced contributions that are substantial in quality and +quantity, and sustained over at least one year. Potential Core Team Members are +nominated by existing Core members and voted upon by the existing Core Team +after asking if the potential Member is interested and willing to serve in that +capacity. 
The Core Team will be initially formed from the set of existing +Contributors who have been granted commit rights as of late 2015. + +When considering potential Members, the Core Team will look at candidates with +a comprehensive view of their contributions. This will include but is not +limited to code, code review, infrastructure work, mailing list and chat +participation, community help/building, education and outreach, design work, +etc. We are deliberately not setting arbitrary quantitative metrics (like “100 +commits in this repo”) to avoid encouraging behavior that plays to the metrics +rather than the project’s overall well-being. We want to encourage a diverse +array of backgrounds, viewpoints and talents in our team, which is why we +explicitly do not define code as the sole metric on which Core Team membership +will be evaluated. + +If a Core Team member becomes inactive in the project for a period of one year, +they will be considered for removal from the Core Team. Before removal, +inactive Member will be approached by the BDFL to see if they plan on returning +to active participation. If not they will be removed immediately upon a Core +Team vote. If they plan on returning to active participation soon, they will be +given a grace period of one year. If they don't return to active participation +within that time period they will be removed by vote of the Core Team without +further grace period. All former Core Team members can be considered for +membership again at any time in the future, like any other Project Contributor. +Retired Core Team members will be listed on the project website, acknowledging +the period during which they were active in the Core Team. + +The Core Team reserves the right to eject current Members, other than the BDFL, +if they are deemed to be actively harmful to the project’s well-being, and +attempts at communication and conflict resolution have failed. 
+ +### Conflict of interest + +It is expected that the BDFL and Core Team Members will be employed at a wide +range of companies, universities and non-profit organizations. Because of this, +it is possible that Members will have conflict of interests. Such conflict of +interests include, but are not limited to: + +- Financial interests, such as investments, employment or contracting work, + outside of The Project that may influence their work on The Project. +- Access to proprietary information of their employer that could potentially + leak into their work with the Project. + +All members of the Core Team, BDFL included, shall disclose to the rest of the +Core Team any conflict of interest they may have. Members with a conflict of +interest in a particular issue may participate in Core Team discussions on that +issue, but must recuse themselves from voting on the issue. If the BDFL has +recused his/herself for a particular decision, they will appoint a substitute +BDFL for that decision. + +### Private communications of the Core Team + +Unless specifically required, all Core Team discussions and activities will be +public and done in collaboration and discussion with the Project Contributors +and Community. The Core Team will have a private mailing list that will be used +sparingly and only when a specific matter requires privacy. When private +communications and decisions are needed, the Core Team will do its best to +summarize those to the Community after eliding personal/private/sensitive +information that should not be posted to the public internet. + +### Subcommittees + +The Core Team can create subcommittees that provide leadership and guidance for +specific aspects of the project. Like the Core Team as a whole, subcommittees +should conduct their business in an open and public manner unless privacy is +specifically called for. 
Private subcommittee communications should happen on +the main private mailing list of the Core Team unless specifically called for. + +Question: if the BDFL is not on a subcommittee, do they still have override +authority? + +Suggestion: they do, but they should appoint a delegate who plays that role +most of the time, and explicit BDFL intervention is sought only if the +committee disagrees with that delegate’s decision and no resolution is possible +within the team. This is different from a BDFL delegate for a specific decision +(or a recusal situation), where the BDFL is literally giving up his/her +authority to someone else in full. It’s more like what Linus Torvalds uses with his +“lieutenants” model. + +### NumFOCUS Subcommittee + +The Core Team will maintain one narrowly focused subcommittee to manage its +interactions with NumFOCUS. + +- The NumFOCUS Subcommittee is comprised of at least 5 persons who manage + project funding that comes through NumFOCUS. It is expected that these funds + will be spent in a manner that is consistent with the non-profit mission of + NumFOCUS and the direction of the Project as determined by the full Core + Team. +- This Subcommittee shall NOT make decisions about the direction, scope or + technical direction of the Project. +- This Subcommittee will have at least 5 members. No more than 2 Subcommittee + Members can report to one person (either directly or indirectly) through + employment or contracting work (including the reportee, i.e. the reportee + 1 + is the max). This avoids effective majorities resting on one person. + +## Institutional Partners and Funding + +The BDFL and Core Team are the primary leadership for the project. No outside +institution, individual or legal entity has the ability to own, control, usurp +or influence the project other than by participating in the Project as +Contributors and Core Team. 
However, because institutions are the primary +funding mechanism for the project, it is important to formally acknowledge +institutional participation in the project. These are Institutional Partners. + +An Institutional Contributor is any individual Project Contributor who +contributes to the project as part of their official duties at an Institutional +Partner. Likewise, an Institutional Core Team Member is any Core Team Member +who contributes to the project as part of their official duties at an +Institutional Partner. + +With these definitions, an Institutional Partner is any recognized legal entity +in the United States or elsewhere that employs at least one Institutional +Contributor or Institutional Core Team Member. Institutional Partners can be +for-profit or non-profit entities. + +Institutions become eligible to become an Institutional Partner by employing +individuals who actively contribute to The Project as part of their official +duties. To state this another way, the only way for an Institutional Partner to +influence the project is by actively contributing to the open development of +the project, on equal terms with any other member of the community of +Contributors and Core Team Members. Merely using pandas Software or Services in +an institutional context does not allow an entity to become an Institutional +Partner. Financial gifts do not enable an entity to become an Institutional +Partner. Once an institution becomes eligible for Institutional Partnership, +the Core Team must nominate and approve the Partnership. + +If an existing Institutional Partner no longer has a contributing employee, +they will be given a one-year grace period for other employees to begin +contributing. + +An Institutional Partner is free to pursue funding for their work on The +Project through any legal means. 
This could involve a non-profit organization +raising money from private foundations and donors or a for-profit company +building proprietary products and services that leverage Project Software and +Services. Funding acquired by Institutional Partners to work on The Project is +called Institutional Funding. However, no funding obtained by an Institutional +Partner can override The Project BDFL and Core Team. If a Partner has funding +to do pandas work and the Core Team decides to not pursue that work as a +project, the Partner is free to pursue it on their own. However in this +situation, that part of the Partner’s work will not be under the pandas +umbrella and cannot use the Project trademarks in a way that suggests a formal +relationship. + +To acknowledge institutional contributions, there are two levels of +Institutional Partners, with associated benefits: + +**Tier 1** = an institution with at least one Institutional Core Team Member + +- Acknowledged on the pandas website, in talks and T-shirts. +- Ability to acknowledge their own funding sources on the pandas website, in + talks and T-shirts. +- Ability to influence the project through the participation of their Core Team + Member. + +**Tier 2** = an institution with at least one Institutional Contributor + +## Breach + +Non-compliance with the terms of the governance documents shall be reported to +the Core Team either through public or private channels as deemed appropriate. + +## Changing the Governance + +Changes to the governance are submitted via a GitHub pull request to The Project's +[governance page](https://github.com/pandas-dev/pandas/blob/main/web/pandas/about/governance.md). +The pull request is then refined in response to public comment and review, with +the goal being consensus in the community. 
After this open period, a Core Team +Member proposes to the Core Team that the changes be ratified and the pull +request merged (accepting the proposed changes) or proposes that the pull +request be closed without merging (rejecting the proposed changes). The Member +should state the final commit hash in the pull request being proposed for +acceptance or rejection and briefly summarize the pull request. A minimum of +80% of the Core Team must vote and at least 2/3 of the votes must be positive +to carry out the proposed action (fractions of a vote rounded up to the nearest +integer). Since the BDFL holds ultimate authority in The Project, the BDFL has +authority to act alone in accepting or rejecting changes or overriding Core +Team decisions. diff --git a/web/pandas/content/pt/about/index.md b/web/pandas/content/pt/about/index.md new file mode 100644 index 0000000000000..4c809a148b328 --- /dev/null +++ b/web/pandas/content/pt/about/index.md @@ -0,0 +1,86 @@ +# About pandas + +## History of development + +In 2008, _pandas_ development began at [AQR Capital Management](https://www.aqr.com). +By the end of 2009 it had been [open sourced](https://en.wikipedia.org/wiki/Open_source), +and is actively supported today by a community of like-minded individuals around the world who +contribute their valuable time and energy to help make open source _pandas_ +possible. Thank you to [all of our contributors](team.html). + +Since 2015, _pandas_ is a [NumFOCUS sponsored project](https://numfocus.org/sponsored-projects). +This will help ensure the success of development of _pandas_ as a world-class open-source project. 
+ +### Timeline + +- **2008**: Development of _pandas_ started +- **2009**: _pandas_ becomes open source +- **2012**: First edition of _Python for Data Analysis_ is published +- **2015**: _pandas_ becomes a [NumFOCUS sponsored project](https://numfocus.org/sponsored-projects) +- **2018**: First in-person core developer sprint + +## Library Highlights + +- A fast and efficient **DataFrame** object for data manipulation with + integrated indexing; + +- Tools for **reading and writing data** between in-memory data structures and + different formats: CSV and text files, Microsoft Excel, SQL databases, and + the fast HDF5 format; + +- Intelligent **data alignment** and integrated handling of **missing data**: + gain automatic label-based alignment in computations and easily manipulate + messy data into an orderly form; + +- Flexible **reshaping** and pivoting of data sets; + +- Intelligent label-based **slicing**, **fancy indexing**, and **subsetting** + of large data sets; + +- Columns can be inserted and deleted from data structures for **size + mutability**; + +- Aggregating or transforming data with a powerful **group by** engine + allowing split-apply-combine operations on data sets; + +- High performance **merging and joining** of data sets; + +- **Hierarchical axis indexing** provides an intuitive way of working with + high-dimensional data in a lower-dimensional data structure; + +- **Time series**-functionality: date range generation and frequency + conversion, moving window statistics, date shifting and lagging. + Even create domain-specific time offsets and join time + series without losing data; + +- Highly **optimized for performance**, with critical code paths written in + [Cython](https://cython.org) or C. + +- Python with *pandas* is in use in a wide variety of **academic and + commercial** domains, including Finance, Neuroscience, Economics, + Statistics, Advertising, Web Analytics, and more. 
+ +## Mission + +_pandas_ aims to be the fundamental high-level building block for doing practical, +real world data analysis in Python. +Additionally, it has the broader goal of becoming the most powerful and flexible +open source data analysis / manipulation tool available in any language. + +## Vision + +A world where data analytics and manipulation software is: + +- Accessible to everyone +- Free for users to use and modify +- Flexible +- Powerful +- Easy to use +- Fast + +## Values + +Is in the core of _pandas_ to be respectful and welcoming with everybody, +users, contributors and the broader community. Regardless of level of experience, +gender, gender identity and expression, sexual orientation, disability, +personal appearance, body size, race, ethnicity, age, religion, or nationality. diff --git a/web/pandas/content/pt/about/roadmap.md b/web/pandas/content/pt/about/roadmap.md new file mode 100644 index 0000000000000..aba95ec2c03fc --- /dev/null +++ b/web/pandas/content/pt/about/roadmap.md @@ -0,0 +1,197 @@ +# Roadmap + +This page provides an overview of the major themes in pandas' +development. Each of these items requires a relatively large amount of +effort to implement. These may be achieved more quickly with dedicated +funding or interest from contributors. + +An item being on the roadmap does not mean that it will *necessarily* +happen, even with unlimited funding. During the implementation period we +may discover issues preventing the adoption of the feature. + +Additionally, an item *not* being on the roadmap does not exclude it +from inclusion in pandas. The roadmap is intended for larger, +fundamental changes to the project that are likely to take months or +years of developer time. Smaller-scoped items will continue to be +tracked on our [issue tracker](https://github.com/pandas-dev/pandas/issues). + +The roadmap is defined as a set of major enhancement proposals named PDEPs. 
+For more information about PDEPs, and how to submit one, please refer to
+[PDEP-1]({{ base_url }}pdeps/0001-purpose-and-guidelines.html).
+
+## PDEPs
+
+{% for pdep_type in ["Under discussion", "Accepted", "Implemented", "Rejected"] %}
+

{{ pdep_type.replace("_", " ").capitalize() }}

+ + + +{% endfor %} + +## Roadmap points pending a PDEP + + + +### Extensibility + +Pandas `extending.extension-types` allow +for extending NumPy types with custom data types and array storage. +Pandas uses extension types internally, and provides an interface for +3rd-party libraries to define their own custom data types. + +Many parts of pandas still unintentionally convert data to a NumPy +array. These problems are especially pronounced for nested data. + +We'd like to improve the handling of extension arrays throughout the +library, making their behavior more consistent with the handling of +NumPy arrays. We'll do this by cleaning up pandas' internals and +adding new methods to the extension array interface. + +### String data type + +Currently, pandas stores text data in an `object` -dtype NumPy array. +The current implementation has two primary drawbacks: First, `object` +-dtype is not specific to strings: any Python object can be stored in an +`object` -dtype array, not just strings. Second: this is not efficient. +The NumPy memory model isn't especially well-suited to variable width +text data. + +To solve the first issue, we propose a new extension type for string +data. This will initially be opt-in, with users explicitly requesting +`dtype="string"`. The array backing this string dtype may initially be +the current implementation: an `object` -dtype NumPy array of Python +strings. + +To solve the second issue (performance), we'll explore alternative +in-memory array libraries (for example, Apache Arrow). As part of the +work, we may need to implement certain operations expected by pandas +users (for example the algorithm used in, `Series.str.upper`). That work +may be done outside of pandas. + +### Apache Arrow interoperability + +[Apache Arrow](https://arrow.apache.org) is a cross-language development +platform for in-memory data. The Arrow logical types are closely aligned +with typical pandas use cases. 
+
+We'd like to provide better-integrated support for Arrow memory and
+data types within pandas. This will let us take advantage of its I/O
+capabilities and provide for better interoperability with other
+languages and libraries using Arrow.
+
+### Decoupling of indexing and internals
+
+The code for getting and setting values in pandas' data structures
+needs refactoring. In particular, we must clearly separate code that
+converts keys (e.g., the argument to `DataFrame.loc`) to positions from
+code that uses these positions to get or set values. This is related to
+the proposed BlockManager rewrite. Currently, the BlockManager sometimes
+uses label-based, rather than position-based, indexing. We propose that
+it should only work with positional indexing, and the translation of
+keys to positions should be entirely done at a higher level.
+
+Indexing is a complicated API with many subtleties. This refactor will require care
+and attention. The following principles should inspire refactoring of indexing code and
+should result in cleaner, simpler, and more performant code.
+
+1. Label indexing must never involve looking in an axis twice for the same label(s).
+This implies that any validation step must either:
+
+    * limit validation to general features (e.g. dtype/structure of the key/index), or
+    * reuse the result for the actual indexing.
+
+2. Indexers must never rely on an explicit call to other indexers.
+For instance, it is OK to have some internal method of `.loc` call some
+internal method of `__getitem__` (or of their common base class),
+but never in the code flow of `.loc` should `the_obj[something]` appear.
+
+3. Execution of positional indexing must never involve labels (as currently, sadly, happens).
+That is, the code flow of a getter call (or a setter call in which the right hand side is non-indexed)
+to `.iloc` should never involve the axes of the object in any way.
+
+4. 
Indexing must never involve accessing/modifying values (i.e., act on `._data` or `.values`) more than once. +The following steps must hence be clearly decoupled: + + * find positions we need to access/modify on each axis + * (if we are accessing) derive the type of object we need to return (dimensionality) + * actually access/modify the values + * (if we are accessing) construct the return object + +5. As a corollary to the decoupling between 4.i and 4.iii, any code which deals on how data is stored +(including any combination of handling multiple dtypes, and sparse storage, categoricals, third-party types) +must be independent from code that deals with identifying affected rows/columns, +and take place only once step 4.i is completed. + + * In particular, such code should most probably not live in `pandas/core/indexing.py` + * ... and must not depend in any way on the type(s) of axes (e.g. no `MultiIndex` special cases) + +6. As a corollary to point 1.i, `Index` (sub)classes must provide separate methods for any desired validity check of label(s) which does not involve actual lookup, +on the one side, and for any required conversion/adaptation/lookup of label(s), on the other. + +7. Use of trial and error should be limited, and anyway restricted to catch only exceptions +which are actually expected (typically `KeyError`). + + * In particular, code should never (intentionally) raise new exceptions in the `except` portion of a `try... exception` + +8. Any code portion which is not specific to setters and getters must be shared, +and when small differences in behavior are expected (e.g. getting with `.loc` raises for +missing labels, setting still doesn't), they can be managed with a specific parameter. + +### Numba-accelerated operations + +[Numba](https://numba.pydata.org) is a JIT compiler for Python code. 
+We'd like to provide ways for users to apply their own Numba-jitted
+functions where pandas accepts user-defined functions (for example,
+`Series.apply`,
+`DataFrame.apply`,
+`DataFrame.applymap`, and in groupby and
+window contexts). This will improve the performance of
+user-defined-functions in these operations by staying within compiled
+code.
+
+### Documentation improvements
+
+We'd like to improve the content, structure, and presentation of the
+pandas documentation. Some specific goals include
+
+- Overhaul the HTML theme with a modern, responsive design
+  (`15556`)
+- Improve the "Getting Started" documentation, designing and writing
+  learning paths for users from different backgrounds (e.g. brand new to
+  programming, familiar with other languages like R, already familiar
+  with Python).
+- Improve the overall organization of the documentation and specific
+  subsections of the documentation to make navigation and finding
+  content easier.
+
+### Performance monitoring
+
+Pandas uses [airspeed velocity](https://asv.readthedocs.io/en/stable/)
+to monitor for performance regressions. ASV itself is a fabulous tool,
+but requires some additional work to be integrated into an open source
+project's workflow.
+
+The [asv-runner](https://github.com/asv-runner) organization, currently
+made up of pandas maintainers, provides tools built on top of ASV. We
+have a physical machine for running a number of projects' benchmarks,
+and tools managing the benchmark runs and reporting on results.
+
+We'd like to fund improvements and maintenance of these tools to
+
+- Be more stable. Currently, they're maintained on the nights and
+  weekends when a maintainer has free time.
+- Tune the system for benchmarks to improve stability.
+
+- Build a GitHub bot to request ASV runs *before* a PR is merged.
+  Currently, the benchmarks are only run nightly.
diff --git a/web/pandas/content/pt/about/sponsors.md b/web/pandas/content/pt/about/sponsors.md new file mode 100644 index 0000000000000..4473a16cfd590 --- /dev/null +++ b/web/pandas/content/pt/about/sponsors.md @@ -0,0 +1,60 @@ +# Sponsors + +## NumFOCUS + +![](https://numfocus.org/wp-content/uploads/2018/01/optNumFocus_LRG.png) + +_pandas_ is a Sponsored Project of [NumFOCUS](https://numfocus.org/), a 501(c)(3) nonprofit charity in the United States. +NumFOCUS provides _pandas_ with fiscal, legal, and administrative support to help ensure the +health and sustainability of the project. Visit numfocus.org for more information. + +Donations to _pandas_ are managed by NumFOCUS. For donors in the United States, your gift is tax-deductible +to the extent provided by law. As with any donation, you should consult with your tax adviser about your particular tax situation. + +## Become a sponsor + +As a free and open source project, _pandas_ relies on the support of the community of users for its development. +If you work for an organization that uses and benefits from _pandas_, please consider supporting pandas. There +are different ways, such as employing people to work on pandas, funding the project, or becoming a +[NumFOCUS sponsor](https://numfocus.org/sponsors) to support the broader ecosystem. Please contact us at +[admin@numfocus.org](mailto:admin@numfocus.org) to discuss. + +## Institutional partners + +Institutional partners are companies and universities that support the project by employing contributors. +Current institutional partners include: + + + +## Sponsors + +Sponsors are organizations that provide funding for pandas. Current sponsors include: + + + +## In-kind sponsors + +In-kind sponsors are organizations that support pandas development with goods or services. 
+Current in-kind sponsors include: + + + +## Past institutional partners + + diff --git a/web/pandas/content/pt/about/team.md b/web/pandas/content/pt/about/team.md new file mode 100644 index 0000000000000..7a19fd7af6595 --- /dev/null +++ b/web/pandas/content/pt/about/team.md @@ -0,0 +1,85 @@ +# Team + +## Contributors + +_pandas_ is made with love by more than [2,000 volunteer contributors](https://github.com/pandas-dev/pandas/graphs/contributors). + +If you want to support pandas development, you can find information in the [donations page]({{ base_url }}donate.html). + +## Active maintainers + +
+ {% for username in maintainers.active %} + {% set person = maintainers.github_info.get(username) %} +
+ +
+
+ {% if person.blog %} + + {{ person.name or person.login }} + + {% else %} + {{ person.name or person.login }} + {% endif %} +
+

{{ person.login }}

+
+
+ {% endfor %} +
+ +## Diversity and Inclusion + +> _pandas_ expressly welcomes and encourages contributions from anyone who faces under-representation, discrimination in the technology industry +> or anyone willing to increase the diversity of our team. +> We have identified visible gaps and obstacles in sustaining diversity and inclusion in the open-source communities and we are proactive in increasing +> the diversity of our team. +> We have a [code of conduct]({{ base_url }}community/coc.html) to ensure a friendly and welcoming environment. +> Please send an email to [pandas-code-of-conduct-committee](mailto:pandas-coc@googlegroups.com), if you think we can do a +> better job at achieving this goal. + +## Governance + +The project governance is available in the [project governance page]({{ base_url }}about/governance.html). + +## Workgroups + +{% for k, workgroup in workgroups.items() %} + +### {{ workgroup.name }} + + + +{% endfor %} + +## Inactive maintainers + + diff --git a/web/pandas/content/pt/community/benchmarks.md b/web/pandas/content/pt/community/benchmarks.md new file mode 100644 index 0000000000000..1e63832a5a2ba --- /dev/null +++ b/web/pandas/content/pt/community/benchmarks.md @@ -0,0 +1,79 @@ +# Benchmarks + +Benchmarks are tests to measure the performance of pandas. There are two different +kinds of benchmarks relevant to pandas: + +* Internal pandas benchmarks to measure speed and memory usage over time +* Community benchmarks comparing the speed or memory usage of different tools at + doing the same job + +## pandas benchmarks + +pandas benchmarks are implemented in the [asv_bench](https://github.com/pandas-dev/pandas/tree/main/asv_bench) +directory of our repository. The benchmarks are implemented for the +[airspeed velocity](https://asv.readthedocs.io/en/v0.6.1/) (asv for short) framework. + +The benchmarks can be run locally by any pandas developer. 
This can be done
+with the `asv run` command, and it can be useful to detect if local changes have
+an impact on performance, by running the benchmarks before and after the changes.
+More information on running the performance test suite is found
+[here](https://pandas.pydata.org/docs/dev/development/contributing_codebase.html#running-the-performance-test-suite).
+
+Note that benchmarks are not deterministic, and running in different hardware or
+running in the same hardware with different levels of stress have a big impact on
+the result. Even running the benchmarks with identical hardware and almost identical
+conditions produces significant differences when running the same exact code.
+
+## pandas benchmarks servers
+
+We currently have two physical servers running the benchmarks of pandas for every
+(or almost every) commit to the `main` branch. The servers run independently from
+each other. The original server has been running for a long time, and it is physically
+located with one of the pandas maintainers. The newer server is in a datacenter
+kindly sponsored by [OVHCloud](https://www.ovhcloud.com/). More information about
+pandas sponsors, and how your company can support the development of pandas is
+available at the [pandas sponsors]({{ base_url }}about/sponsors.html) page.
+
+Results of the benchmarks are available at:
+
+- Original server: [asv](https://asv-runner.github.io/asv-collection/pandas/)
+- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/) (benchmark results can
+  also be visualized in this [Conbench PoC](http://57.128.112.95:5000/))
+
+### Original server configuration
+
+The machine can be configured with the Ansible playbook in
+[tomaugspurger/asv-runner](https://github.com/tomaugspurger/asv-runner).
+The results are published to another GitHub repository,
+[tomaugspurger/asv-collection](https://github.com/tomaugspurger/asv-collection).
+
+The benchmarks are scheduled by [Airflow](https://airflow.apache.org/).
+It has a dashboard for viewing and debugging the results.
+You’ll need to set up an SSH tunnel to view them:
+
+```
+ssh -L 8080:localhost:8080 pandas@panda.likescandy.com
+```
+
+### OVH server configuration
+
+The server used to run the benchmarks has been configured to reduce system
+noise and maximize the stability of the benchmark times.
+
+The details on how the server is configured can be found in the
+[pandas-benchmarks repository](https://github.com/pandas-dev/pandas-benchmarks).
+There is a quick summary here:
+
+- CPU isolation: Avoid user space tasks to execute in the same CPU as benchmarks, possibly interrupting them during the execution (include all virtual CPUs using a physical core)
+- NoHZ: Stop the kernel tick that enables context switching in the isolated CPU
+- IRQ affinity: Ban benchmarks CPU to avoid many (but not all) kernel interruptions in the isolated CPU
+- TurboBoost: Disable CPU scaling based on high CPU demand
+- P-States: Use "performance" governor to disable P-States and CPU frequency changes based on them
+- C-States: Set C-State to 0 and disable changes to avoid slower CPU after system inactivity
+
+## Community benchmarks
+
+The main benchmarks comparing dataframe tools that include pandas are:
+
+- [DuckDB (former H2O.ai) benchmarks](https://duckdblabs.github.io/db-benchmark/)
+- [TPCH benchmarks](https://pola.rs/posts/benchmarks/)
diff --git a/web/pandas/content/pt/community/blog/2019-user-survey.md b/web/pandas/content/pt/community/blog/2019-user-survey.md
new file mode 100644
index 0000000000000..821fdd01acf65
--- /dev/null
+++ b/web/pandas/content/pt/community/blog/2019-user-survey.md
@@ -0,0 +1,172 @@
+Title: 2019 pandas user survey
+Date: 2019-08-22
+
+
+
+# 2019 pandas user survey
+
+Pandas recently conducted a user survey to help guide future development.
+Thanks to everyone who participated! This post presents the high-level results.
+ +This analysis and the raw data can be found [on GitHub](https://github.com/pandas-dev/pandas-user-surveys) and run on Binder + +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/pandas-dev/pandas-user-surveys/master?filepath=2019.ipynb) + + +We had about 1250 responses over the 15 days we ran the survey in the summer of 2019. + +## About the Respondents + +There was a fair amount of representation across pandas experience and frequency of use, though the majority of respondents are on the more experienced side. + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_4_0.png) + + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_5_0.png) + + +We included a few questions that were also asked in the [Python Developers Survey](https://www.jetbrains.com/research/python-developers-survey-2018/) so we could compare Pandas' population to Python's. + +90% of our respondents use Python as a primary language (compared with 84% from the PSF survey). + + + + + + Yes 90.67% + No 9.33% + Name: Is Python your main language?, dtype: object + + + +Windows users are well represented (see [Steve Dower's talk](https://www.youtube.com/watch?v=uoI57uMdDD4) on this topic). + + + + + + Linux 61.57% + Windows 60.21% + MacOS 42.75% + Name: What Operating Systems do you use?, dtype: object + + + +For environment isolation, [conda](https://conda.io/en/latest/) was the most popular. + + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_13_0.png) + + +Most respondents are Python 3 only. + + + + + + 3 92.39% + 2 & 3 6.80% + 2 0.81% + Name: Python 2 or 3?, dtype: object + + + +## Pandas APIs + +It can be hard for open source projects to know what features are actually used. We asked a few questions to get an idea. + +CSV and Excel are (for better or worse) the most popular formats. 
+
+
+
+![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_18_0.png)
+
+
+In preparation for a possible refactor of pandas internals, we wanted to get a sense for
+how common wide (100s of columns or more) DataFrames are.
+
+
+
+![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_20_0.png)
+
+
+Pandas is slowly growing new extension types. Categoricals are the most popular,
+and the nullable integer type is already almost as popular as datetime with timezone.
+
+
+
+![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_22_0.png)
+
+
+More and better examples seem to be a high-priority development item.
+Pandas recently received a NumFOCUS grant to improve our documentation,
+which we're using to write tutorial-style documentation, which should help
+meet this need.
+
+
+
+![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_24_0.png)
+
+
+We also asked about specific, commonly-requested features.
+
+
+
+![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_26_0.png)
+
+
+Of these, the clear standout is "scaling" to large datasets. A couple of observations:
+
+1. Perhaps pandas' documentation should do a better job of promoting libraries that provide scalable dataframes (like [Dask](https://dask.org), [vaex](https://vaex.io), and [modin](https://modin.readthedocs.io/en/latest/))
+2. Memory efficiency (perhaps from a native string data type, fewer internal copies, etc.) is a valuable goal.
+
+After that, the next-most critical improvement is integer missing values. Those were actually added in [Pandas 0.24](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#optional-integer-na-support), but they're not the default, and there are still some incompatibilities with the rest of pandas API.
+
+Pandas is a less conservative library than, say, NumPy. We're approaching 1.0, but on the way we've made many deprecations and some outright API breaking changes. Fortunately, most people are OK with the tradeoff.
+
+
+
+
+    Yes    94.89%
+    No      5.11%
+    Name: Is Pandas stable enough for you?, dtype: object
+
+
+
+There's a perception (which is shared by many of the pandas maintainers) that the pandas API is too large. To measure that, we asked whether users thought that pandas' API was too large, too small, or just right.
+
+
+
+![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_31_0.png)
+
+
+Finally, we asked for an overall satisfaction with the library, from 1 (not very unsatisfied) to 5 (very satisfied).
+
+
+
+![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_33_0.png)
+
+
+Most people are very satisfied. The average response is 4.39. I look forward to tracking this number over time.
+
+If you're analyzing the raw data, be sure to share the results with us [@pandas_dev](https://twitter.com/pandas_dev).
diff --git a/web/pandas/content/pt/community/blog/asv-pandas-grant.md b/web/pandas/content/pt/community/blog/asv-pandas-grant.md
new file mode 100644
index 0000000000000..205c2222fec94
--- /dev/null
+++ b/web/pandas/content/pt/community/blog/asv-pandas-grant.md
@@ -0,0 +1,141 @@
+Title: Write up of the NumFOCUS grant to improve pandas benchmarks and diversity
+Date: 2022-04-01
+
+# Write up of the NumFOCUS grant to improve pandas benchmarks and diversity
+
+*By Lucy Jiménez and Dorothy Kabarozi B.*
+
+We want to share our experience working on **Improvements to the**
+**ASV benchmarking framework and diversity efforts** sponsored by
+[NumFOCUS](https://numfocus.org/) to the [pandas](https://pandas.pydata.org/)
+project.
+
+This grant focused on two aspects: the first one is to improve the
+[asv library](https://asv.readthedocs.io/en/stable/), a tool used for
+benchmarking Python packages and used by pandas; this project was
+unmaintained, and the codebase was quite old; additionally, it didn't
+adhere to modern standards, had Python 2 compatibility code that could
+be removed, and also the CI could be improved.
The second aspect is
+encouraging more underrepresented groups to contribute to open source
+projects. This grant was held over 10 weeks, working around 20 hours a
+week. It was developed by Dorothy Kabarozi B. from Uganda and Lucy
+Jiménez from Colombia, under the mentoring of Marc Garcia.
+
+## Why were we part of the grant?
+
+Even though we come from different backgrounds, Dorothy from systems
+engineering and Lucy from computational chemistry, we have always been
+interested in participating and contributing to open source software
+projects. For that reason, we have been running the PyLadies meetups in
+our communities ([PyLadies Kampala](https://twitter.com/pyladieskla),
+[PyLadies Colombia](https://twitter.com/pyladies_co)) and have always
+been on the lookout for any opportunities that lead us to contribute.
+
+It all happened through Marc Garcia; he had put out a call through a post
+on social media to mentor ladies from diverse backgrounds. Dorothy got to
+be part of the pandas mentorship group. At the same time, Lucy was
+co-organizer of the SciPy Latam conference, and it is from here she met
+Marc, who was the speaker at that conference, and through this mutual
+connection, we were able to learn about this benchmarks grant.
+
+In brief, by attending conferences, meetups, and social media, you can
+make connections and links that will lead you to these opportunities.
+
+## Learning from the source code
+
+At the beginning of the grant, we started from the basics. We noticed that
+we could improve our skills in managing Git and GitHub. For example, we had
+some troubles with the git workflow, so we had to read and practice more
+about it. One of the valuable resources was the explanation from Marc about
+[how to make an open source contribution](https://tubedu.org/w/kjnHEg72j76StmSFmjzbnE),
+which we invite you to take a look at.
+ +We learned a lot from the source code and gained immense knowledge about +best practices and code quality through this grant. We have been working +on: updating the code to improve the style to follow the PEP-8 guidelines, +removing Python 2 compatibility code and six dependencies, and finding +unused code and removing it. We also learned about GitHub actions, and we +started building the CI on GitHub actions for the asv package; for that we +have been working on add linting with Flake8, testing with pytest, building +docs, and running CI on different python versions. + +Additionally, we were able to identify bugs in the source code, review +pull request from other contributors, and create new issues, something we +thought only maintainers could do but not contributors. Finally, not only +is reviewing the code itself a learning experience, but also the structure +and folder hierarchy in the project started to be more transparent. + +## Our experience + +For this grant, we had a fantastic Mentor, Marc Garcia. He was always +willing to share his knowledge, explain unclear concepts and share helpful +feedback. Whenever we would implement that feedback, it felt easier to work +on more issues faster. We felt the growth from the time we started on this +project, and we will carry it along as we contribute to more open source +projects; this all goes back to Marc for his fantastic mentorship. It is +also important to note that we received feedback from other contributors, +stakeholders, and core devs during this process, which gave us a broader +look at the work in open source projects. + +We also built a strong teamwork partnership. We helped each other a lot as +we had numerous one-on-one calls to understand the tasks better. We always +looked for ways to support each other from the technical side and encouraged +each other when needed. For us, it was professional and human growth. 
+ +## Running an open source software sprint + +The knowledge and experience acquired in this process allowed us to +organize two virtual sprints. The events were carried out in the company +of local PyLadies communities; the first one was on February 26th with +[PyLadies Kampala](https://twitter.com/pyladieskla) and on March 21 +with [PyLadies Colombia](https://bit.ly/sprint-asv). + +While organizing these events, we learned how to organize and conduct a +virtual sprint. Some participants in the sprint ultimately had no idea +about open source, and it was great explaining open source concepts and +taking them through the Git workflow. Finally, they were able to make their +first contribution. We learned how to follow up on contributors, helping +them along the way until their PRs were merged and by reviewing their +contributions on GitHub. + +The most outstanding achievement was mentoring new contributors and +sharing the knowledge acquired from this grant with others participants +in our respective communities. Most new contributors after the experience +have gone ahead to apply for outreach and the upcoming +[Google Summer of Code](https://summerofcode.withgoogle.com/) +to apply the skills they learned from these sprints. + +## Conclusion + +In conclusion, we learned a lot from this experience from the code part, +the workflow on the open source projects, how to be resilient in difficult +moments, and encouraging more women and people from our local communities +to contribute to open source projects. + +Finally, if you want to be part of an open source project, we invite you +to check out GitHub repos for different projects you are interested in and +search for the easy issues to work on and get started. Also, you can contact +the maintainers of the projects with specific questions, search for the +open source communities in your country or contact us for more help. 
+ +## Acknowledgments + +Many thanks to [NumFOCUS](https://numfocus.org/) for giving us this support +through [Small Development Grants](https://numfocus.org/programs/small-development-grants) +and Marc for the excellent mentoring he generously gave us throughout these +weeks. + +We are looking forward to contributing more and impacting our communities +and the open source community! + +___ +If you want to know more, please don't hesitate to connect with us through +these channels: + +*Lucy Jiménez* +* [Twitter](https://twitter.com/JimenezLucyJ) +* [LinkedIn](https://www.linkedin.com/in/lucy-j/) + +*Dorothy Kabarozi* +* [Twitter](https://twitter.com/kizdorothy) +* [LinkedIn](https://www.linkedin.com/in/dorothy-kabarozi/) diff --git a/web/pandas/content/pt/community/blog/extension-arrays.md b/web/pandas/content/pt/community/blog/extension-arrays.md new file mode 100644 index 0000000000000..80a187bb3fc3c --- /dev/null +++ b/web/pandas/content/pt/community/blog/extension-arrays.md @@ -0,0 +1,218 @@ +Title: pandas extension arrays +Date: 2019-01-04 + +# pandas extension arrays + +Extensibility was a major theme in pandas development over the last couple of +releases. This post introduces the pandas extension array interface: the +motivation behind it and how it might affect you as a pandas user. Finally, we +look at how extension arrays may shape the future of pandas. + +Extension Arrays are just one of the changes in pandas 0.24.0. See the +[whatsnew][whatsnew] for a full changelog. + +## The Motivation + +Pandas is built on top of NumPy. You could roughly define a Series as a wrapper +around a NumPy array, and a DataFrame as a collection of Series with a shared +index. That's not entirely correct for several reasons, but I want to focus on +the "wrapper around a NumPy array" part. It'd be more correct to say "wrapper +around an array-like object". + +Pandas mostly uses NumPy's builtin data representation; we've restricted it in +places and extended it in others. 
For example, pandas' early users cared greatly +about timezone-aware datetimes, which NumPy doesn't support. So pandas +internally defined a `DatetimeTZ` dtype (which mimics a NumPy dtype), and +allowed you to use that dtype in `Index`, `Series`, and as a column in a +`DataFrame`. That dtype carried around the tzinfo, but wasn't itself a valid +NumPy dtype. + +As another example, consider `Categorical`. This actually composes *two* arrays: +one for the `categories` and one for the `codes`. But it can be stored in a +`DataFrame` like any other column. + +Each of these extension types pandas added is useful on its own, but carries a +high maintenance cost. Large sections of the codebase need to be aware of how to +handle a NumPy array or one of these other kinds of special arrays. This made +adding new extension types to pandas very difficult. + +Anaconda, Inc. had a client who regularly dealt with datasets with IP addresses. +They wondered if it made sense to add an [IPArray][IPArray] to pandas. In the +end, we didn't think it passed the cost-benefit test for inclusion in pandas +*itself*, but we were interested in defining an interface for third-party +extensions to pandas. Any object implementing this interface would be allowed in +pandas. I was able to write [cyberpandas][cyberpandas] outside of pandas, but it +feels like using any other dtype built into pandas. + +## The Current State + +As of pandas 0.24.0, all of pandas' internal extension arrays (Categorical, +Datetime with Timezone, Period, Interval, and Sparse) are now built on top of +the ExtensionArray interface. Users shouldn't notice many changes. The main +thing you'll notice is that things are cast to `object` dtype in fewer places, +meaning your code will run faster and your types will be more stable. This +includes storing `Period` and `Interval` data in `Series` (which were previously +cast to object dtype). + +Additionally, we'll be able to add *new* extension arrays with relative ease. 
+For example, 0.24.0 (optionally) solved one of pandas' longest-standing pain
+points: missing values casting integer-dtype values to float.
+
+
+```python
+>>> int_ser = pd.Series([1, 2], index=[0, 2])
+>>> int_ser
+0    1
+2    2
+dtype: int64
+
+>>> int_ser.reindex([0, 1, 2])
+0    1.0
+1    NaN
+2    2.0
+dtype: float64
+```
+
+With the new [IntegerArray][IntegerArray] and nullable integer dtypes, we can
+natively represent integer data with missing values.
+
+```python
+>>> int_ser = pd.Series([1, 2], index=[0, 2], dtype=pd.Int64Dtype())
+>>> int_ser
+0    1
+2    2
+dtype: Int64
+
+>>> int_ser.reindex([0, 1, 2])
+0    1
+1    NaN
+2    2
+dtype: Int64
+```
+
+One thing it does change slightly is how you should access the raw (unlabeled)
+arrays stored inside a Series or Index, which is occasionally useful. Perhaps
+the method you're calling only works with NumPy arrays, or perhaps you want to
+disable automatic alignment.
+
+In the past, you'd hear things like "Use `.values` to extract the NumPy array
+from a Series or DataFrame." If it were a good resource, they'd tell you that's
+not *entirely* true, since there are some exceptions. I'd like to delve into
+those exceptions.
+
+The fundamental problem with `.values` is that it serves two purposes:
+
+1. Extracting the array backing a Series, Index, or DataFrame
+2. Converting the Series, Index, or DataFrame to a NumPy array
+
+As we saw above, the "array" backing a Series or Index might not be a NumPy
+array, it may instead be an extension array (from pandas or a third-party
+library). For example, consider `Categorical`,
+
+```python
+>>> cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
+>>> ser = pd.Series(cat)
+>>> ser
+0    a
+1    b
+2    a
+dtype: category
+Categories (3, object): ['a', 'b', 'c']
+
+>>> ser.values
+[a, b, a]
+Categories (3, object): ['a', 'b', 'c']
+```
+
+In this case `.values` is a Categorical, not a NumPy array.
For period-dtype +data, `.values` returns a NumPy array of `Period` objects, which is expensive to +create. For timezone-aware data, `.values` converts to UTC and *drops* the +timezone info. These kinds of surprises (different types, or expensive or lossy +conversions) stem from trying to shoehorn these extension arrays into a NumPy +array. But the entire point of an extension array is for representing data NumPy +*can't* natively represent. + +To solve the `.values` problem, we've split its roles into two dedicated methods: + +1. Use `.array` to get a zero-copy reference to the underlying data +2. Use `.to_numpy()` to get a (potentially expensive, lossy) NumPy array of the + data. + +So with our Categorical example, + +```python +>>> ser.array +[a, b, a] +Categories (3, object): ['a', 'b', 'c'] + +>>> ser.to_numpy() +array(['a', 'b', 'a'], dtype=object) +``` + +To summarize: + +- `.array` will *always* be an ExtensionArray, and is always a zero-copy + reference back to the data. +- `.to_numpy()` is *always* a NumPy array, so you can reliably call + ndarray-specific methods on it. + +You shouldn't ever need `.values` anymore. + +## Possible Future Paths + +Extension Arrays open up quite a few exciting opportunities. Currently, pandas +represents string data using Python objects in a NumPy array, which is slow. +Libraries like [Apache Arrow][arrow] provide native support for variable-length +strings, and the [Fletcher][fletcher] library provides pandas extension arrays +for Arrow arrays. It will allow [GeoPandas][geopandas] to store geometry data +more efficiently. Pandas (or third-party libraries) will be able to support +nested data, data with units, geo data, GPU arrays. Keep an eye on the +[pandas ecosystem][eco] page, which will keep track of third-party extension +arrays. It's an exciting time for pandas development. + +## Other Thoughts + +I'd like to emphasize that this is an *interface*, and not a concrete array +implementation.
We are *not* reimplementing NumPy here in pandas. Rather, this +is a way to take any array-like data structure (one or more NumPy arrays, an +Apache Arrow array, a CuPy array) and place it inside a DataFrame. I think +getting pandas out of the array business, and instead thinking about +higher-level tabular data things, is a healthy development for the project. + +This works perfectly with NumPy's [`__array_ufunc__`][ufunc] protocol and +[NEP-18][nep18]. You'll be able to use the familiar NumPy API on objects that +aren't backed by NumPy memory. + +## Upgrade + +These new goodies are all available in the recently released pandas 0.24. + +conda: + + conda install -c conda-forge pandas + +pip: + + pip install --upgrade pandas + +As always, we're happy to hear feedback on the [mailing list][ml], +[@pandas-dev][twitter], or [issue tracker][tracker]. + +Thanks to the many contributors, maintainers, and [institutional +partners][partners] involved in the pandas community. + + +[IPArray]: https://github.com/pandas-dev/pandas/issues/18767 +[cyberpandas]: https://cyberpandas.readthedocs.io +[IntegerArray]: http://pandas.pydata.org/pandas-docs/version/0.24/reference/api/pandas.arrays.IntegerArray.html +[fletcher]: https://github.com/xhochy/fletcher +[arrow]: https://arrow.apache.org +[ufunc]: https://numpy.org/neps/nep-0013-ufunc-overrides.html +[nep18]: https://www.numpy.org/neps/nep-0018-array-function-protocol.html +[ml]: https://mail.python.org/mailman/listinfo/pandas-dev +[twitter]: https://twitter.com/pandas_dev +[tracker]: https://github.com/pandas-dev/pandas/issues +[partners]: https://pandas.pydata.org/about/sponsors.html +[eco]: http://pandas.pydata.org/pandas-docs/stable/ecosystem.html#extension-data-types +[whatsnew]: http://pandas.pydata.org/pandas-docs/version/0.24/whatsnew/v0.24.0.html +[geopandas]: https://github.com/geopandas/geopandas diff --git a/web/pandas/content/pt/community/blog/index.html 
b/web/pandas/content/pt/community/blog/index.html new file mode 100644 index 0000000000000..6f0bee2e11407 --- /dev/null +++ b/web/pandas/content/pt/community/blog/index.html @@ -0,0 +1,14 @@ +{% extends "layout.html" %} + +{% block body %} + {% for post in blog.posts %} +
+
+
{{ post.title }}
+
Source: {{ post.feed }} | Author: {{ post.author }} | Published: {{ post.published.strftime("%b %d, %Y") }}
+
{{ post.summary }}
+ Read more +
+
+ {% endfor %} +{% endblock %} diff --git a/web/pandas/content/pt/community/blog/pandas-1.0.md b/web/pandas/content/pt/community/blog/pandas-1.0.md new file mode 100644 index 0000000000000..d190ed6e897b3 --- /dev/null +++ b/web/pandas/content/pt/community/blog/pandas-1.0.md @@ -0,0 +1,31 @@ +Title: pandas 1.0 +Date: 2020-01-29 + +# pandas 1.0 + +Today pandas celebrates its 1.0.0 release. In many ways this is just a normal release with a host of new features, performance improvements, and bug fixes, which are documented in our [release notes](https://pandas.pydata.org/pandas-docs/version/1.0.0/whatsnew/v1.0.0.html). But it’s also something a bit more — a milestone for the project beyond just the commits. We wanted to take some time to reflect on where we've been and where we're going. + +## Reflections + +The world of scientific Python has changed a lot since pandas was started. In 2011, [the ecosystem was fragmented](https://wesmckinney.com/blog/a-roadmap-for-rich-scientific-data-structures-in-python/): a standard *rich* data structure for statistics and data science had yet to emerge. This echos a similar story for NumPy, which consolidated array efforts that were [previously fragmented](https://numpy.org/old_array_packages.html). + +Over the subsequent years, pandas emerged as a *de facto* standard. It’s used by data scientists and analysts and as a data structure for other libraries to build on top of. StackOverflow [cited pandas](https://stackoverflow.blog/2017/09/14/python-growing-quickly/) as one of the reasons for Python being the fastest growing major programming language. + +![Growth of pandas](https://149351115.v2.pressablecdn.com/wp-content/uploads/2017/09/related_tags_over_time-1-1000x1000.png) + +Today, the ecosystem is in another phase of exploration. +Several new DataFrame implementations are cropping up to fill needs not met by pandas. 
+We're [working with those projects](https://datapythonista.me/blog/dataframe-summit-at-euroscipy.html) to establish shared standards and semantics for rich data structures. + +## Community and Project Health + +This release cycle is the first to involve any kind of grant funding for pandas. [Pandas received funding](https://chanzuckerberg.com/eoss/proposals/) as part of the CZI’s [*Essential Open Source Software for Science*](https://medium.com/@cziscience/the-invisible-foundations-of-biomedicine-4ab7f8d4f5dd) [program](https://medium.com/@cziscience/the-invisible-foundations-of-biomedicine-4ab7f8d4f5dd). The pandas project relies overwhelmingly on volunteer contributors. These volunteer contributions are shepherded and augmented by some maintainers who are given time from their employers — our [institutional partners](../about/sponsors.html). The largest work item in our grant award was library maintenance, which specifically includes working with community members to address our large backlog of open issues and pull requests. + +While a “1.0.0” version might seem arbitrary or anti-climactic (given that pandas as a codebase is nearly 12 years old), we see it as a symbolic milestone celebrating the growth of our core developer team and depth of our contributor base. Few open source projects are ever truly “done” and pandas is no different. We recognize the essential role that pandas now occupies, and we intend to continue to evolve the project and adapt to the needs of the world’s data wranglers. + +## Going Forward + +Our [roadmap](https://pandas.pydata.org/pandas-docs/version/1.0.0/development/roadmap.html) contains an up-to-date listing of where we see the project heading over the next few years. +Needless to say, there's still plenty to do. 
+ +Check out the [release notes](https://pandas.pydata.org/pandas-docs/version/1.0.0/whatsnew/v1.0.0.html) and visit the [installation page](https://pandas.pydata.org/pandas-docs/version/1.0.0/getting_started/install.html) for instructions on updating to pandas 1.0. diff --git a/web/pandas/content/pt/community/coc.md b/web/pandas/content/pt/community/coc.md new file mode 100644 index 0000000000000..22cd77859c557 --- /dev/null +++ b/web/pandas/content/pt/community/coc.md @@ -0,0 +1,65 @@ +# Code of conduct + +As contributors and maintainers of this project, and in the interest of +fostering an open and welcoming community, we pledge to respect all people who +contribute through reporting issues, posting feature requests, updating +documentation, submitting pull requests or patches, and other activities. + +We are committed to making participation in this project a harassment-free +experience for everyone, regardless of level of experience, gender, gender +identity and expression, sexual orientation, disability, personal appearance, +body size, race, ethnicity, age, religion, or nationality. + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery +* Personal attacks +* Trolling or insulting/derogatory comments +* Public or private harassment +* Publishing other's private information, such as physical or electronic + addresses, without explicit permission +* Other unethical or unprofessional conduct + +Furthermore, we encourage inclusive behavior - for example, +please don't say “hey guys!” but “hey everyone!”. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. 
+ +By adopting this Code of Conduct, project maintainers commit themselves to +fairly and consistently applying these principles to every aspect of managing +this project. Project maintainers who do not follow or enforce the Code of +Conduct may be permanently removed from the project team. + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. + +A working group of community members is committed to promptly addressing any +reported issues. The working group is made up of pandas contributors and users. +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the working group by e-mail (pandas-coc@googlegroups.com). +Messages sent to this e-mail address will not be publicly visible but only to +the working group members. The working group currently includes + + + +All complaints will be reviewed and investigated and will result in a response +that is deemed necessary and appropriate to the circumstances. Maintainers are +obligated to maintain confidentiality with regard to the reporter of an +incident. + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 1.3.0, available at +[https://www.contributor-covenant.org/version/1/3/0/][version], +and the [Swift Code of Conduct][swift]. + +[homepage]: https://www.contributor-covenant.org +[version]: https://www.contributor-covenant.org/version/1/3/0/ +[swift]: https://swift.org/community/#code-of-conduct diff --git a/web/pandas/content/pt/community/ecosystem.md b/web/pandas/content/pt/community/ecosystem.md new file mode 100644 index 0000000000000..74b7c1f4884a1 --- /dev/null +++ b/web/pandas/content/pt/community/ecosystem.md @@ -0,0 +1,745 @@ +# Ecosystem + +Increasingly, packages are being built on top of pandas to address +specific needs in data preparation, analysis and visualization. 
This is +encouraging because it means pandas is not only helping users to handle +their data tasks but also that it provides a better starting point for +developers to build powerful and more focused data tools. The creation +of libraries that complement pandas' functionality also allows pandas +development to remain focused around its original requirements. + +This is a community-maintained list of projects that build on pandas in order +to provide tools in the PyData space. The pandas core development team does not necessarily endorse any particular project on this list or have any knowledge of the maintenance status of any particular library. + +For a more complete list of projects that depend on pandas, see the [libraries.io usage page for +pandas](https://libraries.io/pypi/pandas/usage) or [search pypi for +pandas](https://pypi.org/search/?q=pandas). + +We'd like to make it easier for users to find these projects, if you +know of other substantial projects that you feel should be on this list, +please let us know. + +## Statistics and machine learning + +### [Statsmodels](https://www.statsmodels.org/) + +Statsmodels is the prominent Python "statistics and econometrics +library" and it has a long-standing special relationship with pandas. +Statsmodels provides powerful statistics, econometrics, analysis and +modeling functionality that is out of pandas' scope. Statsmodels +leverages pandas objects as the underlying data container for +computation. + +### [skrub](https://skrub-data.org) + +Skrub facilitates machine learning on dataframes. It bridges pandas +to scikit-learn and related. In particular it facilitates building +features from dataframes. + +### [Featuretools](https://github.com/alteryx/featuretools/) + +Featuretools is a Python library for automated feature engineering built +on top of pandas. 
It excels at transforming temporal and relational +datasets into feature matrices for machine learning using reusable +feature engineering "primitives". Users can contribute their own +primitives in Python and share them with the rest of the community. + +### [Compose](https://github.com/alteryx/compose) + +Compose is a machine learning tool for labeling data and prediction engineering. +It allows you to structure the labeling process by parameterizing +prediction problems and transforming time-driven relational data into +target values with cutoff times that can be used for supervised learning. + +### [STUMPY](https://github.com/TDAmeritrade/stumpy) + +STUMPY is a powerful and scalable Python library for modern time series analysis. +At its core, STUMPY efficiently computes something called a +[matrix profile](https://stumpy.readthedocs.io/en/latest/Tutorial_The_Matrix_Profile.html), +which can be used for a wide variety of time series data mining tasks. + +## Visualization + +### [Altair](https://altair-viz.github.io/) + +Altair is a declarative statistical visualization library for Python. +With Altair, you can spend more time understanding your data and its +meaning. Altair's API is simple, friendly and consistent and built on +top of the powerful Vega-Lite JSON specification. This elegant +simplicity produces beautiful and effective visualizations with a +minimal amount of code. Altair works with Pandas DataFrames. + +### [Bokeh](https://docs.bokeh.org) + +Bokeh is a Python interactive visualization library for large datasets +that natively uses the latest web technologies. Its goal is to provide +elegant, concise construction of novel graphics in the style of +Protovis/D3, while delivering high-performance interactivity over large +data to thin clients. 
+ +[Pandas-Bokeh](https://github.com/PatrikHlobil/Pandas-Bokeh) provides a +high level API for Bokeh that can be loaded as a native Pandas plotting +backend via + +``` +pd.set_option("plotting.backend", "pandas_bokeh") +``` + +It is very similar to the matplotlib plotting backend, but provides +interactive web-based charts and maps. + +### [pygwalker](https://github.com/Kanaries/pygwalker) + +PyGWalker is an interactive data visualization and +exploratory data analysis tool built upon Graphic Walker +with support for visualization, cleaning, and annotation workflows. + +pygwalker can save interactively created charts +to Graphic-Walker and Vega-Lite JSON. + +``` +import pygwalker as pyg +pyg.walk(df) +``` + +### [seaborn](https://seaborn.pydata.org) + +Seaborn is a Python visualization library based on +[matplotlib](https://matplotlib.org). It provides a high-level, +dataset-oriented interface for creating attractive statistical graphics. +The plotting functions in seaborn understand pandas objects and leverage +pandas grouping operations internally to support concise specification +of complex visualizations. Seaborn also goes beyond matplotlib and +pandas with the option to perform statistical estimation while plotting, +aggregating across observations and visualizing the fit of statistical +models to emphasize patterns in a dataset. + +``` +import seaborn as sns +sns.set_theme() +``` + +### [plotnine](https://github.com/has2k1/plotnine/) + +Hadley Wickham's [ggplot2](https://ggplot2.tidyverse.org/) is a +foundational exploratory visualization package for the R language. Based +on ["The Grammar of +Graphics"](https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html) +it provides a powerful, declarative and extremely general way to +generate bespoke plots of any kind of data. +Various implementations to other languages are available. 
+A good implementation for Python users is [has2k1/plotnine](https://github.com/has2k1/plotnine/). + +### [IPython Vega](https://github.com/vega/ipyvega) + +[IPython Vega](https://github.com/vega/ipyvega) leverages [Vega](https://github.com/vega/vega) to create plots within Jupyter Notebook. + +### [Plotly](https://plot.ly/python) + +[Plotly's](https://plot.ly/) [Python API](https://plot.ly/python/) +enables interactive figures and web shareability. Maps, 2D, 3D, and +live-streaming graphs are rendered with WebGL and +[D3.js](https://d3js.org/). The library supports plotting directly from +a pandas DataFrame and cloud-based collaboration. Users of [matplotlib, +ggplot for Python, and +Seaborn](https://plot.ly/python/matplotlib-to-plotly-tutorial/) can +convert figures into interactive web-based plots. Plots can be drawn in +[IPython Notebooks](https://plot.ly/ipython-notebooks/) , edited with R +or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly +is free for unlimited sharing, and has +[cloud](https://plot.ly/product/plans/), +[offline](https://plot.ly/python/offline/), or +[on-premise](https://plot.ly/product/enterprise/) accounts for private +use. + +### [Lux](https://github.com/lux-org/lux) + +Lux is a Python library that facilitates fast and easy experimentation with data by automating the visual data exploration process. To use Lux, simply add an extra import alongside pandas: + +```python +import lux +import pandas as pd + +df = pd.read_csv("data.csv") +df # discover interesting insights! +``` + +By printing out a dataframe, Lux automatically [recommends a set of visualizations](https://raw.githubusercontent.com/lux-org/lux-resources/master/readme_img/demohighlight.gif) that highlights interesting trends and patterns in the dataframe. 
Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a [powerful, intuitive language](https://lux-api.readthedocs.io/en/latest/source/guide/vis.html) that allow users to create Altair, matplotlib, or Vega-Lite visualizations without having to think at the level of code. + +### [D-Tale](https://github.com/man-group/dtale) + +D-Tale is a lightweight web client for visualizing pandas data structures. It +provides a rich spreadsheet-style grid which acts as a wrapper for a lot of +pandas functionality (query, sort, describe, corr...) so users can quickly +manipulate their data. There is also an interactive chart-builder using Plotly +Dash allowing users to build nice portable visualizations. D-Tale can be +invoked with the following command + +```python +import dtale + +dtale.show(df) +``` + +D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle +& Google Colab. Here are some demos of the [grid](http://alphatechadmin.pythonanywhere.com/dtale/main/1). + +### [hvplot](https://hvplot.holoviz.org/index.html) + +hvPlot is a high-level plotting API for the PyData ecosystem built on [HoloViews](https://holoviews.org/). +It can be loaded as a native pandas plotting backend via + +```python +pd.set_option("plotting.backend", "hvplot") +``` + +## IDE + +### [IPython](https://ipython.org/documentation.html) + +IPython is an interactive command shell and distributed computing +environment. IPython tab completion works with Pandas methods and also +attributes like DataFrame columns. + +### [Jupyter Notebook / Jupyter Lab](https://jupyter.org) + +Jupyter Notebook is a web application for creating Jupyter notebooks. A +Jupyter notebook is a JSON document containing an ordered list of +input/output cells which can contain code, text, mathematics, plots and +rich media. 
Jupyter notebooks can be converted to a number of open +standard output formats (HTML, HTML presentation slides, LaTeX, PDF, +ReStructuredText, Markdown, Python) through 'Download As' in the web +interface and `jupyter convert` in a shell. + +Pandas DataFrames implement `_repr_html_` and `_repr_latex` methods which +are utilized by Jupyter Notebook for displaying (abbreviated) HTML or +LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may +or may not be compatible with non-HTML Jupyter output formats.) + +See [Options and Settings](https://pandas.pydata.org/docs/user_guide/options.html) +for pandas `display.` settings. + +### [Spyder](https://www.spyder-ide.org/) + +Spyder is a cross-platform PyQt-based IDE combining the editing, +analysis, debugging and profiling functionality of a software +development tool with the data exploration, interactive execution, deep +inspection and rich visualization capabilities of a scientific +environment like MATLAB or Rstudio. + +Its [Variable +Explorer](https://docs.spyder-ide.org/current/panes/variableexplorer.html) allows +users to view, manipulate and edit pandas `Index`, `Series`, and +`DataFrame` objects like a "spreadsheet", including copying and +modifying values, sorting, displaying a "heatmap", converting data +types and more. Pandas objects can also be renamed, duplicated, new +columns added, copied/pasted to/from the clipboard (as TSV), and +saved/loaded to/from a file. Spyder can also import data from a variety +of plain text and binary files or the clipboard into a new pandas +DataFrame via a sophisticated import wizard. 
+ +Most pandas classes, methods and data attributes can be autocompleted in +Spyder's [Editor](https://docs.spyder-ide.org/current/panes/editor.html) and [IPython +Console](https://docs.spyder-ide.org/current/panes/ipythonconsole.html), and Spyder's +[Help pane](https://docs.spyder-ide.org/current/panes/help.html) can retrieve and +render Numpydoc documentation on pandas objects in rich text with Sphinx +both automatically and on-demand. + +### [marimo](https://marimo.io) + +marimo is a reactive notebook for Python and SQL that enhances productivity when working with dataframes. It provides several features to make data manipulation and visualization more interactive and fun: + +1. Rich, interactive displays: marimo can display pandas dataframes in interactive tables or charts with filtering and sorting capabilities. +2. Data selection: Users can select data in tables or pandas-backed plots, and the selections are automatically sent to Python as pandas dataframes. +3. No-code transformations: Users can interactively transform pandas dataframes using a GUI, without writing code. The generated code can be copied and pasted into the notebook. +4. Custom filters: marimo allows the creation of pandas-backed filters using UI elements like sliders and dropdowns. +5. Dataset explorer: marimo automatically discovers and displays all dataframes in the notebook, allowing users to explore and visualize data interactively. +6. SQL integration: marimo allows users to write SQL queries against any pandas dataframes existing in memory. + +## API + +### [pandas-datareader](https://github.com/pydata/pandas-datareader) + +`pandas-datareader` is a remote data access library for pandas +(PyPI:`pandas-datareader`). It is based on functionality that was +located in `pandas.io.data` and `pandas.io.wb` but was split off in +v0.19. 
See more in the [pandas-datareader +docs](https://pandas-datareader.readthedocs.io/en/latest/): + +The following data feeds are available: + +- Google Finance +- Tiingo +- Morningstar +- IEX +- Robinhood +- Enigma +- Quandl +- FRED +- Fama/French +- World Bank +- OECD +- Eurostat +- TSP Fund Data +- Nasdaq Trader Symbol Definitions +- Stooq Index Data +- MOEX Data + +### [pandaSDMX](https://pandasdmx.readthedocs.io) + +pandaSDMX is a library to retrieve and acquire statistical data and +metadata disseminated in [SDMX](https://sdmx.org) 2.1, an +ISO-standard widely used by institutions such as statistics offices, +central banks, and international organisations. pandaSDMX can expose +datasets and related structural metadata including data flows, +code-lists, and data structure definitions as pandas Series or +MultiIndexed DataFrames. + +### [fredapi](https://github.com/mortada/fredapi) + +fredapi is a Python interface to the [Federal Reserve Economic Data +(FRED)](https://fred.stlouisfed.org/) provided by the Federal Reserve +Bank of St. Louis. It works with both the FRED database and ALFRED +database that contains point-in-time data (i.e. historic data +revisions). fredapi provides a wrapper in Python to the FRED HTTP API, +and also provides several convenient methods for parsing and analyzing +point-in-time data from ALFRED. fredapi makes use of pandas and returns +data in a Series or DataFrame. This module requires a FRED API key that +you can obtain for free on the FRED website. + +## Domain specific + +### [Geopandas](https://github.com/geopandas/geopandas) + +Geopandas extends pandas data objects to include geographic information +which support geometric operations. If your work entails maps and +geographical coordinates, and you love pandas, you should take a close +look at Geopandas. 
+ +### [gurobipy-pandas](https://github.com/Gurobi/gurobipy-pandas) + +gurobipy-pandas provides a convenient accessor API to connect pandas with +gurobipy. It enables users to more easily and efficiently build mathematical +optimization models from data stored in DataFrames and Series, and to read +solutions back directly as pandas objects. + +### [staircase](https://github.com/staircase-dev/staircase) + +staircase is a data analysis package, built upon pandas and numpy, for modelling and +manipulation of mathematical step functions. It provides a rich variety of arithmetic +operations, relational operations, logical operations, statistical operations and +aggregations for step functions defined over real numbers, datetime and timedelta domains. + +### [xarray](https://github.com/pydata/xarray) + +xarray brings the labeled data power of pandas to the physical sciences +by providing N-dimensional variants of the core pandas data structures. +It aims to provide a pandas-like and pandas-compatible toolkit for +analytics on multi-dimensional arrays, rather than the tabular data for +which pandas excels. + +## IO + +### [NTV-pandas](https://github.com/loco-philippe/ntv-pandas) + +NTV-pandas provides a JSON converter with more data types than the ones supported by pandas directly. + +It supports the following data types: + +- pandas data types +- data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm) +- data types defined in [Table Schema specification](https://datapackage.org/standard/table-schema/) + +The interface is always reversible (conversion round trip) with two formats (JSON-NTV and JSON-TableSchema). 
+ +Example: + +```python +import ntv_pandas as npd + +jsn = df.npd.to_json(table=False) # save df as a JSON-value (format Table Schema if table is True else format NTV ) +df = npd.read_json(jsn) # load a JSON-value as a `DataFrame` + +df.equals(npd.read_json(df.npd.to_json(df))) # `True` in any case, whether `table=True` or not +``` + +### [BCPandas](https://github.com/yehoshuadimarsky/bcpandas) + +BCPandas provides high performance writes from pandas to Microsoft SQL Server, +far exceeding the performance of the native ``df.to_sql`` method. Internally, it uses +Microsoft's BCP utility, but the complexity is fully abstracted away from the end user. +Rigorously tested, it is a complete replacement for ``df.to_sql``. + +### [Deltalake](https://pypi.org/project/deltalake) + +Deltalake python package lets you access tables stored in +[Delta Lake](https://delta.io/) natively in Python without the need to use Spark or +JVM. It provides the ``delta_table.to_pyarrow_table().to_pandas()`` method to convert +any Delta table into Pandas dataframe. + +### [pandas-gbq](https://github.com/googleapis/python-bigquery-pandas) + +pandas-gbq provides high performance reads and writes to and from +[Google BigQuery](https://cloud.google.com/bigquery/). Previously (before version 2.2.0), +these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`. +Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq`, instead. + + +### [ArcticDB](https://github.com/man-group/ArcticDB) + +ArcticDB is a serverless DataFrame database engine designed for the Python Data Science ecosystem. ArcticDB enables you to store, retrieve, and process pandas DataFrames at scale. It is a storage engine designed for object storage and also supports local-disk storage using LMDB. ArcticDB requires zero additional infrastructure beyond a running Python environment and access to object storage and can be installed in seconds. 
Please find full documentation [here](https://docs.arcticdb.io/latest/). + +#### ArcticDB Terminology + +ArcticDB is structured to provide a scalable and efficient way to manage and retrieve DataFrames, organized into several key components: + +- `Object Store` Collections of libraries. Used to separate logical environments from each other. Analogous to a database server. +- `Library` Contains multiple symbols which are grouped in a certain way (different users, markets, etc). Analogous to a database. +- `Symbol` Atomic unit of data storage. Identified by a string name. Data stored under a symbol strongly resembles a pandas DataFrame. Analogous to tables. +- `Version` Every modifying action (write, append, update) performed on a symbol creates a new version of that object. + +#### Installation + +To install, simply run: + +```console +pip install arcticdb +``` + +To get started, we can import ArcticDB and instantiate it: + +```python +import arcticdb as adb +import numpy as np +import pandas as pd +# this will set up the storage using the local file system +arctic = adb.Arctic("lmdb://arcticdb_test") +``` + +> **Note:** ArcticDB supports any S3 API compatible storage, including AWS. ArcticDB also supports Azure Blob storage. +> ArcticDB also supports LMDB for local/file based storage - to use LMDB, pass an LMDB path as the URI: `adb.Arctic('lmdb://path/to/desired/database')`. + +#### Library Setup + +ArcticDB is geared towards storing many (potentially millions) of tables. Individual tables (DataFrames) are called symbols and are stored in collections called libraries. A single library can store many symbols. Libraries must first be initialized prior to use: + +```python +lib = arctic.get_library('sample', create_if_missing=True) +``` + +#### Writing Data to ArcticDB + +Now we have a library set up, we can get to reading and writing data. ArcticDB has a set of simple functions for DataFrame storage. Let's write a DataFrame to storage. 
+ +```python +df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3) + } +) + +df +df.dtypes +``` + +Write to ArcticDB. + +```python +write_record = lib.write("test", df) +``` + +> **Note:** When writing pandas DataFrames, ArcticDB supports the following index types: +> +> - `pandas.Index` containing int64 (or the corresponding dedicated types Int64Index, UInt64Index) +> - `RangeIndex` +> - `DatetimeIndex` +> - `MultiIndex` composed of above supported types +> +> The "row" concept in `head`/`tail` refers to the row number ('iloc'), not the value in the `pandas.Index` ('loc'). + +#### Reading Data from ArcticDB + +Read the data back from storage: + +```python +read_record = lib.read("test") +read_record.data +df.dtypes +``` + +ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/). + +### [Hugging Face](https://huggingface.co/datasets) + +The Hugging Face Dataset Hub provides a large collection of ready-to-use datasets for machine learning shared by the community. The platform offers a user-friendly interface to explore, discover and visualize datasets, and provides tools to easily load and work with these datasets in Python thanks to the [huggingface_hub](https://github.com/huggingface/huggingface_hub) library. + +You can access datasets on Hugging Face using `hf://` paths in pandas, in the form `hf://datasets/username/dataset_name/...`. 
+ +For example, here is how to load the [stanfordnlp/imdb dataset](https://huggingface.co/datasets/stanfordnlp/imdb): + +```python +import pandas as pd + +# Load the IMDB dataset +df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet") +``` + +Tip: on a dataset page, click on "Use this dataset" to get the code to load it in pandas. + +To save a dataset on Hugging Face you need to [create a public or private dataset](https://huggingface.co/new-dataset) and [login](https://huggingface.co/docs/huggingface_hub/quick-start#login-command), and then you can use `df.to_csv/to_json/to_parquet`: + +```python +# Save the dataset to my Hugging Face account +df.to_parquet("hf://datasets/username/dataset_name/train.parquet") +``` + +You can find more information about the Hugging Face Dataset Hub in the [documentation](https://huggingface.co/docs/hub/en/datasets). + +## Out-of-core + +### [Bodo](https://github.com/bodo-ai/Bodo) + + +Bodo is a high-performance compute engine for Python data processing. +Using an auto-parallelizing just-in-time (JIT) compiler, Bodo simplifies scaling Pandas +workloads from laptops to clusters without major code changes. +Under the hood, Bodo relies on MPI-based high-performance computing (HPC) technology—making it +both easier to use and often much faster than alternatives. +Bodo also provides a SQL engine that can query distributed pandas dataframes efficiently. + +```python +import pandas as pd +import bodo + +@bodo.jit +def process_data(): + df = pd.read_parquet("my_data.pq") + df2 = pd.DataFrame({"A": df.apply(lambda r: 0 if r.A == 0 else (r.B // r.A), axis=1)}) + df2.to_parquet("out.pq") + +process_data() +``` + + +### [Cylon](https://cylondata.org/) + +Cylon is a fast, scalable, distributed memory parallel runtime with a pandas +like Python DataFrame API. ”Core Cylon” is implemented with C++ using Apache +Arrow format to represent the data in-memory. 
Cylon DataFrame API implements +most of the core operators of pandas such as merge, filter, join, concat, +group-by, drop_duplicates, etc. These operators are designed to work across +thousands of cores to scale applications. It can interoperate with pandas +DataFrame by reading data from pandas or converting data to pandas so users +can selectively scale parts of their pandas DataFrame applications. + +```python +from pycylon import read_csv, DataFrame, CylonEnv +from pycylon.net import MPIConfig + +# Initialize Cylon distributed environment +config: MPIConfig = MPIConfig() +env: CylonEnv = CylonEnv(config=config, distributed=True) + +df1: DataFrame = read_csv('/tmp/csv1.csv') +df2: DataFrame = read_csv('/tmp/csv2.csv') + +# Using 1000s of cores across the cluster to compute the join +df3: DataFrame = df1.join(other=df2, on=[0], algorithm="hash", env=env) + +print(df3) +``` + +### [Dask](https://docs.dask.org) + +Dask is a flexible parallel computing library for analytics. Dask +provides a familiar `DataFrame` interface for out-of-core, parallel and +distributed computing. + +### [Dask-ML](https://ml.dask.org) + +Dask-ML enables parallel and distributed machine learning using Dask +alongside existing machine learning libraries like Scikit-Learn, +XGBoost, and TensorFlow. + +### [Ibis](https://ibis-project.org/docs/) + +Ibis offers a standard way to write analytics code, that can be run in multiple engines. It helps in bridging the gap between local Python environments (like pandas) and remote storage and execution systems like Hadoop components (like HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.). + + +### [Koalas](https://koalas.readthedocs.io/en/latest/) + +Koalas provides a familiar pandas DataFrame interface on top of Apache +Spark. It enables users to leverage multi-cores on one machine or a +cluster of machines to speed up or scale their DataFrame code. 
+ +### [Modin](https://github.com/modin-project/modin) + +The ``modin.pandas`` DataFrame is a parallel and distributed drop-in replacement +for pandas. This means that you can use Modin with existing pandas code or write +new code with the existing pandas API. Modin can leverage your entire machine or +cluster to speed up and scale your pandas workloads, including traditionally +time-consuming tasks like ingesting data (``read_csv``, ``read_excel``, +``read_parquet``, etc.). + +```python +# import pandas as pd +import modin.pandas as pd + +df = pd.read_csv("big.csv") # use all your cores! +``` + +### [Pandarallel](https://github.com/nalepae/pandarallel) + +Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code. +It also displays progress bars. + +```python +from pandarallel import pandarallel + +pandarallel.initialize(progress_bar=True) + +# df.apply(func) +df.parallel_apply(func) +``` + +### [Vaex](https://vaex.io/docs/) + +Increasingly, packages are being built on top of pandas to address +specific needs in data preparation, analysis and visualization. Vaex is +a python library for Out-of-Core DataFrames (similar to Pandas), to +visualize and explore big tabular datasets. It can calculate statistics +such as mean, sum, count, standard deviation etc, on an N-dimensional +grid up to a billion (10^9) objects/rows per second. Visualization is +done using histograms, density plots and 3d volume rendering, allowing +interactive exploration of big data. Vaex uses memory mapping, zero +memory copy policy and lazy computations for best performance (no memory +wasted). + +- ``vaex.from_pandas`` +- ``vaex.to_pandas_df`` + +### [Hail Query](https://hail.is/) + +An out-of-core, preemptible-safe, distributed, dataframe library serving +the genetics community. 
Hail Query ships with on-disk data formats, +in-memory data formats, an expression compiler, a query planner, and a +distributed sort algorithm all designed to accelerate queries on large +matrices of genome sequencing data. + +It is often easiest to use pandas to manipulate the summary statistics or +other small aggregates produced by Hail. For this reason, Hail provides +native import to and export from pandas DataFrames: + +- [`Table.from_pandas`](https://hail.is/docs/latest/hail.Table.html#hail.Table.from_pandas) +- [`Table.to_pandas`](https://hail.is/docs/latest/hail.Table.html#hail.Table.to_pandas) + +## Data cleaning and validation + +### [pyjanitor](https://github.com/pyjanitor-devs/pyjanitor) + +Pyjanitor provides a clean API for cleaning data, using method chaining. + +### [Pandera](https://pandera.readthedocs.io/en/stable/) + +Pandera provides a flexible and expressive API for performing data validation on dataframes +to make data processing pipelines more readable and robust. +Dataframes contain information that pandera explicitly validates at runtime. This is useful in +production-critical data pipelines or reproducible research settings. + +## Extension data types + +Pandas provides an interface for defining +[extension types](https://pandas.pydata.org/docs/development/extending.html#extension-types) to extend NumPy's type system. +The following libraries implement that interface to provide types not found in NumPy or pandas, +which work well with pandas' data containers. + +### [awkward-pandas](https://awkward-pandas.readthedocs.io/) + +Awkward-pandas provides an extension type for storing [Awkward +Arrays](https://awkward-array.org/) inside pandas' Series and +DataFrame. It also provides an accessor for using awkward functions +on Series that are of awkward type. 
+ +### [db-dtypes](https://github.com/googleapis/python-db-dtypes-pandas) + +db-dtypes provides extension types for working with types like +DATE, TIME, and JSON from database systems. This package is used +by pandas-gbq to provide natural dtypes for BigQuery data types without +a natural numpy type. + +### [Pandas-Genomics](https://pandas-genomics.readthedocs.io/en/latest/) + +Pandas-Genomics provides an extension type and extension array for working + with genomics data. It also includes `genomics` accessors for many useful properties + and methods related to QC and analysis of genomics data. + +### [Physipandas](https://github.com/mocquin/physipandas) + +Physipandas provides an extension for manipulating physical quantities + (like scalar and numpy.ndarray) in association with a physical unit + (like meter or joule) and additional features for integration of + `physipy` accessors with pandas Series and Dataframe. + +### [Pint-Pandas](https://github.com/hgrecco/pint-pandas) + +Pint-Pandas provides an extension type for storing numeric arrays with units. +These arrays can be stored inside pandas' Series and DataFrame. Operations +between Series and DataFrame columns which use pint's extension array are then +units aware. + +### [Text Extensions](https://ibm.biz/text-extensions-for-pandas) + +Text Extensions for Pandas provides extension types to cover common data structures for representing natural language data, plus library integrations that convert the outputs of popular natural language processing libraries into pandas DataFrames. + +## Accessors + +A directory of projects providing +[extension accessors](https://pandas.pydata.org/docs/development/extending.html#registering-custom-accessors). +This is for users to discover new accessors and for library +authors to coordinate on the namespace. 
+ + | Library | Accessor | Classes | + | -------------------------------------------------------------------- | ---------- | --------------------- | + | [awkward-pandas](https://awkward-pandas.readthedocs.io/en/latest/) | `ak` | `Series` | + | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | + | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/) | `genomics` | `Series`, `DataFrame` | + | [pint-pandas](https://github.com/hgrecco/pint-pandas) | `pint` | `Series`, `DataFrame` | + | [physipandas](https://github.com/mocquin/physipandas) | `physipy` | `Series`, `DataFrame` | + | [composeml](https://github.com/alteryx/compose) | `slice` | `DataFrame` | + | [gurobipy-pandas](https://github.com/Gurobi/gurobipy-pandas) | `gppd` | `Series`, `DataFrame` | + | [staircase](https://www.staircase.dev/) | `sc` | `Series`, `DataFrame` | + | [woodwork](https://github.com/alteryx/woodwork) | `slice` | `Series`, `DataFrame` | + +## Development tools + +### [pandas-stubs](https://github.com/VirtusLab/pandas-stubs) + +While pandas repository is partially typed, the package itself doesn't expose this information for external use. +Install pandas-stubs to enable basic type coverage of pandas API. + +Learn more by reading through these issues [14468](https://github.com/pandas-dev/pandas/issues/14468), +[26766](https://github.com/pandas-dev/pandas/issues/26766), [28142](https://github.com/pandas-dev/pandas/issues/28142). + +See installation and usage instructions on the [GitHub page](https://github.com/VirtusLab/pandas-stubs). + +### [Hamilton](https://github.com/dagworks-inc/hamilton) + +Hamilton is a declarative dataflow framework that came out of Stitch Fix. It was designed to help one manage a +Pandas code base, specifically with respect to feature engineering for machine learning models. 
+ +It prescribes an opinionated paradigm, that ensures all code is: + +* unit testable +* integration testing friendly +* documentation friendly +* transformation logic is reusable, as it is decoupled from the context of where it is used. +* integratable with runtime data quality checks. + +This helps one to scale your pandas code base, at the same time, keeping maintenance costs low. + +For more information, see [documentation](https://hamilton.readthedocs.io/). diff --git a/web/pandas/content/pt/config.yml b/web/pandas/content/pt/config.yml new file mode 100644 index 0000000000000..a52295f3b987f --- /dev/null +++ b/web/pandas/content/pt/config.yml @@ -0,0 +1,198 @@ +main: + base_url: "pt" + templates_path: _templates + base_template: "layout.html" + production_url: "https://pandas.pydata.org/" + ignore: + - _templates/layout.html + - config.yml + github_repo_url: pandas-dev/pandas + context_preprocessors: + - pandas_web.Preprocessors.current_year + - pandas_web.Preprocessors.navbar_add_info + - pandas_web.Preprocessors.blog_add_posts + - pandas_web.Preprocessors.maintainers_add_info + - pandas_web.Preprocessors.home_add_releases + - pandas_web.Preprocessors.roadmap_pdeps + markdown_extensions: + - toc + - tables + - fenced_code + - meta + - footnotes + - codehilite +static: + logo: static/img/pandas_white.svg + css: + - static/css/pandas.css + - static/css/codehilite.css +navbar: + - name: "Quem somos" + target: + - name: "Sobre o pandas" + target: about/ + - name: "Planejamento (roadmap)" + target: about/roadmap.html + - name: "Governança" + target: about/governance.html + - name: "Equipe" + target: about/team.html + - name: "Patrocinadores" + target: about/sponsors.html + - name: "Citações e logo" + target: about/citing.html + - name: "Comece aqui" + target: getting_started.html + - name: "Documentação" + target: docs/ + - name: "Comunidade" + target: + - name: "Blog" + target: community/blog/ + - name: "Faça uma pergunta (StackOverflow)" + target: 
https://stackoverflow.com/questions/tagged/pandas + - name: "Código de conduta" + target: community/coc.html + - name: "Ecossistema" + target: community/ecosystem.html + - name: "Benchmarks" + target: community/benchmarks.html + - name: "Contribuir" + target: contribute.html + +blog: + num_posts: 50 + posts_path: community/blog + author: "pandas team" + feed_name: "pandas blog" + feed: + - https://wesmckinney.com/feeds/pandas.atom.xml + - https://tomaugspurger.github.io/feed + - https://jorisvandenbossche.github.io/feeds/pandas.atom.xml + - https://datapythonista.me/blog/feeds/pandas.atom.xml + - https://numfocus.org/tag/pandas/feed/ + - https://phofl.github.io/feeds/pandas.atom.xml +maintainers: + active: + - jorisvandenbossche + - TomAugspurger + - jreback + - WillAyd + - mroeschke + - jbrockmendel + - datapythonista + - simonjayhawkins + - topper-123 + - alimcmaster1 + - bashtage + - Dr-Irv + - rhshadrach + - phofl + - attack68 + - fangchenli + - lithomas1 + - lukemanley + - noatamir + inactive: + - lodagro + - jseabold + - jtratner + - shoyer + - chris-b1 + - sinhrks + - cpcloud + - toobaz + - jschendel + - charlesdong1991 + - dsaxton + - wesm + - gfyoung + - mzeitlin11 + - twoertwein + - MarcoGorelli +workgroups: + coc: + name: Código de conduta + contact: coc@pandas.pydata.org + responsibilities: "Make sure pandas is the welcoming and inclusive community we want it to be. Keeping the CoC updated, and addressing violation reports." + members: + - Tom Augspurger + - Bijay Regmi + - Wuraola Oyewusi + - Мария Чакчурина + finance: + name: Finanças + contact: finance@pandas.pydata.org + responsibilities: "Manage the funding. Coordinate the request of grants. Approve the project expenses." + members: + - Matthew Roeschke + - Jeff Reback + - Joris Van den Bossche + - Patrick Hoefler + infrastructure: + name: Infraestrutura + contact: infrastructure@pandas.pydata.org + responsibilities: "Keep the pandas infrastructure up and working. 
In particular the servers for the website, benchmarks, CI and others needed." + members: + - William Ayd + - Thomas Li + communications: + name: Comunicações + contact: communications@pandas.pydata.org + responsibilities: "Share relevant information with the broader community, mainly via our social networks, as well as being the main point of contact between NumFOCUS and the core team." + members: + - datapythonista +sponsors: + active: + - name: "NumFOCUS" + url: https://numfocus.org/ + logo: static/img/partners/numfocus.svg + kind: numfocus + - name: "Coiled" + url: https://www.coiled.io + logo: static/img/partners/coiled.svg + kind: partner + description: "Patrick Hoefler" + - name: "Nvidia" + url: https://www.nvidia.com + logo: static/img/partners/nvidia.svg + kind: partner + description: "Matthew Roeschke" + - name: "Tidelift" + url: https://tidelift.com + logo: static/img/partners/tidelift.svg + kind: regular + description: "pandas is part of the Tidelift subscription. You can support pandas by becoming a Tidelift subscriber." + - name: "Bodo" + url: https://www.bodo.ai/ + logo: static/img/partners/bodo.svg + kind: regular + description: "Bodo's parallel computing platform uses pandas API, and Bodo financially supports pandas development to help improve pandas, in particular the pandas API" + inkind: # not included in active so they don't appear in the home page + - name: "OVH" + url: https://us.ovhcloud.com/ + description: "Website and documentation hosting." 
+ - name: "Indeed" + url: https://opensource.indeedeng.io/ + description: "pandas logo design" + past: + - name: "Paris-Saclay Center for Data Science" + url: https://www.datascience-paris-saclay.fr/ + kind: partner + - name: "Anaconda" + url: https://www.anaconda.com/ + kind: partner + - name: "RStudio" + url: https://www.rstudio.com/ + kind: partner + - name: "Ursa Labs" + url: https://ursalabs.org/ + kind: partner + - name: "Gousto" + url: https://www.gousto.co.uk/ + kind: partner + - name: "d-fine GmbH" + url: https://www.d-fine.com/en/ + kind: partner +roadmap: + pdeps_path: pdeps diff --git a/web/pandas/content/pt/contribute.md b/web/pandas/content/pt/contribute.md new file mode 100644 index 0000000000000..3307ddcfb1a0a --- /dev/null +++ b/web/pandas/content/pt/contribute.md @@ -0,0 +1,55 @@ +# Contribute to pandas + +_pandas_ is and will always be **free**. To make the development sustainable, we need _pandas_ users, corporate +and individual, to support the development by providing their time and money. + +You can find more information about current developers in the [team page]({{ base_url }}about/team.html), +and about current sponsors in the [sponsors page]({{ base_url }}about/sponsors.html). + +
+
+
+
+ + + + +

Corporate support

+

+ pandas depends on companies and institutions using the software to support its development. Hiring + people to work on pandas, or letting existing employees to contribute to the + software. Or sponsoring pandas with funds, so the project can hire people to + progress on the pandas roadmap. +

+

More information in the sponsors page

+
+
+ + + + +

Individual contributors

+

+ pandas is mostly developed by volunteers. All kind of contributions are welcome, + such as contributions to the code, to the website (including graphical designers), + to the documentation (including translators) and others. There are tasks for all + levels, including beginners. +

+

More information in the contributing page

+
+
+ + + + +

Donations

+

+ Individual donations are appreciated, and are used for things like the project + infrastructure, travel expenses for our volunteer contributors to attend + the in-person sprints, or to give small grants to develop features. +

+

Make your donation in the donate page

+
+
+
+
diff --git a/web/pandas/content/pt/donate.md b/web/pandas/content/pt/donate.md new file mode 100644 index 0000000000000..69db7e4648e77 --- /dev/null +++ b/web/pandas/content/pt/donate.md @@ -0,0 +1,14 @@ +# Donate to pandas + +
+
+ + +_pandas_ is a Sponsored Project of [NumFOCUS](https://numfocus.org/), a 501(c)(3) nonprofit charity in the United States. +NumFOCUS provides _pandas_ with fiscal, legal, and administrative support to help ensure the +health and sustainability of the project. Visit numfocus.org for more information. + +Donations to _pandas_ are managed by NumFOCUS. For donors in the United States, your gift is tax-deductible +to the extent provided by law. As with any donation, you should consult with your tax adviser about your particular tax situation. diff --git a/web/pandas/content/pt/getting_started.md b/web/pandas/content/pt/getting_started.md new file mode 100644 index 0000000000000..c556eda57ac31 --- /dev/null +++ b/web/pandas/content/pt/getting_started.md @@ -0,0 +1,44 @@ +# Getting started + +## Installation instructions + +To install pandas, please reference the [installation page]({{ base_url}}docs/getting_started/install.html) +from the pandas documentation. + +## Tutorials + +You can learn more about pandas in the [tutorials]({{ base_url }}docs/getting_started/intro_tutorials/), +and more about JupyterLab in the +[JupyterLab documentation](https://jupyterlab.readthedocs.io/en/stable/user/interface.html). + +## Books + +The book we recommend to learn pandas is [Python for Data Analysis](https://amzn.to/3DyLaJc), +by [Wes McKinney](https://wesmckinney.com/), creator of pandas. + + + Python for Data Analysis + + +## Videos + + + +## Cheat sheet + +[pandas cheat sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf) + +## Try pandas in your browser (experimental) + +You can try pandas in your browser with the following interactive shell +without needing to install anything on your system. + +

+ Try it in your browser +

diff --git a/web/pandas/content/pt/index.html b/web/pandas/content/pt/index.html new file mode 100644 index 0000000000000..bbd8632e06840 --- /dev/null +++ b/web/pandas/content/pt/index.html @@ -0,0 +1,136 @@ +{% extends "layout.html" %} +{% block body %} +
+
+
+
+

pandas

+

+ pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
+ built on top of the Python programming language. +

+

+ Install pandas now! +

+
+ +
+
+
Getting started
+ +
+ +
+
Community
+ +
+
+
+
With the support of:
+ {% for row in sponsors.active | batch(6, "") %} +
+ {% for company in row %} +
+ {% if company %} + + {{ company.name }} + + {% endif %} +
+ {% endfor %} +
+ {% endfor %} +

The full list of companies supporting pandas is available in the sponsors page. +

+
+
+ {% if releases %} +

Latest version: {{ releases[0].name }}

+ + {% endif %} +

Follow us

+
+ + +
+

Recommended books

+

+ + Python for Data Analysis + +

+

+ + Effective pandas 2 + +

+ {% if releases[1:5] %} +

Previous versions

+
    + {% for release in releases[1:5] %} +
  • + {{ release.name }} ({{ release.published.strftime("%b %d, %Y") }})
    + changelog | + docs | + code +
  • + {% endfor %} +
+ {% endif %} + {% if releases[5:] %} +

+ +

+
    + {% for release in releases[5:] %} +
  • + {{ release.name }} ({{ release.published.strftime("%b %d, %Y") }})
    + changelog | + docs | + code +
  • + {% endfor %} +
+ {% endif %} +
+
+
+ +{% endblock %} diff --git a/web/pandas/static/js/language_switcher.js b/web/pandas/static/js/language_switcher.js new file mode 100644 index 0000000000000..d9742a811be93 --- /dev/null +++ b/web/pandas/static/js/language_switcher.js @@ -0,0 +1,60 @@ +window.addEventListener("DOMContentLoaded", function() { + + // ABS_BASE_URL is the full URL + // e.g. https://pandas.pydata.org/en/getting_started.html + var ABS_BASE_URL = document.baseURI; + var CURRENT_LANGUAGE = document.documentElement.lang; + var languages = JSON.parse(document.getElementById("languages").getAttribute('data-lang').replace(/'/g, '"')); + + const language_names = { + 'en': 'English', + 'es': 'Español', + 'zh': '中文', + 'ja': '日本語', + 'ko': '한국어', + 'fr': 'Français', + 'pt': 'Português' + } + + // Create dropdown menu + function makeDropdown(options) { + var dropdown = document.createElement("li"); + dropdown.classList.add("nav-item"); + dropdown.classList.add("dropdown"); + + var link = document.createElement("a"); + link.classList.add("nav-link"); + link.classList.add("dropdown-toggle"); + link.setAttribute("data-bs-toggle", "dropdown"); + link.setAttribute("href", "#"); + link.setAttribute("role", "button"); + link.setAttribute("aria-haspopup", "true"); + link.setAttribute("aria-expanded", "false"); + link.textContent = language_names[CURRENT_LANGUAGE]; + + var dropdownMenu = document.createElement("div"); + dropdownMenu.classList.add("dropdown-menu"); + + options.forEach(function(i) { + var dropdownItem = document.createElement("a"); + dropdownItem.classList.add("dropdown-item"); + dropdownItem.textContent = language_names[i] || i.toUpperCase(); + dropdownItem.setAttribute("href", "#"); + dropdownItem.addEventListener("click", function() { + var newUrl = ABS_BASE_URL.replace('/' + CURRENT_LANGUAGE + '/', '/' + i + '/'); + window.location.href = newUrl; + }); + dropdownMenu.appendChild(dropdownItem); + }); + + dropdown.appendChild(link); + dropdown.appendChild(dropdownMenu); + return dropdown; 
+ } + + var container = document.getElementById("language-switcher-container"); + if (container) { + var dropdown = makeDropdown(languages); + container.appendChild(dropdown); + } +}); diff --git a/web/pandas_web.py b/web/pandas_web.py index b3872b829c73a..6dc931d228231 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -291,7 +291,8 @@ def roadmap_pdeps(context): # accepted, rejected and implemented pdeps_path = ( - pathlib.Path(context["source_path"]) / context["roadmap"]["pdeps_path"] + pathlib.Path(context["source_path"]).parent + / context["roadmap"]["pdeps_path"] ) for pdep in sorted(pdeps_path.iterdir()): if pdep.suffix != ".md": @@ -434,6 +435,7 @@ def extend_base_template(content: str, base_template: str) -> str: def main( source_path: str, target_path: str, + languages: str, ) -> int: """ Copy every file in the source directory to the target directory. @@ -441,58 +443,90 @@ def main( For ``.md`` and ``.html`` files, render them with the context before copying them. ``.md`` files are transformed to HTML. 
""" - config_fname = os.path.join(source_path, "config.yml") - - shutil.rmtree(target_path, ignore_errors=True) - os.makedirs(target_path, exist_ok=True) - - sys.stderr.write("Generating context...\n") - context = get_context(config_fname, target_path=target_path) - sys.stderr.write("Context generated\n") - - templates_path = os.path.join(source_path, context["main"]["templates_path"]) - jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_path)) - - for fname in get_source_files(source_path): - if os.path.normpath(fname) in context["main"]["ignore"]: - continue + # Read all subfolder names in the source directory + if not languages: + languages = os.listdir(source_path).remove("pdeps") + + for language in languages: + source_path_lang = os.path.join(source_path, language) + target_path_lang = os.path.join(target_path, language) + pdep_path = os.path.join(source_path, "pdeps") + static_path = os.path.join(pathlib.Path(source_path).parent, "static") + config_fname = os.path.join(source_path_lang, "config.yml") + + shutil.rmtree(target_path_lang, ignore_errors=True) + os.makedirs(target_path_lang, exist_ok=True) + shutil.copytree(pdep_path, os.path.join(target_path_lang, "pdeps")) + shutil.copytree(static_path, os.path.join(target_path_lang, "static")) + + sys.stderr.write(f"Generating context for {language}...\n") + context = get_context(config_fname, target_path=target_path_lang) + context["selected_language"] = language + context["languages"] = languages + sys.stderr.write("Context generated\n") + + templates_path = ( + pathlib.Path(source_path).parent / context["main"]["templates_path"] + ) + jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_path)) - sys.stderr.write(f"Processing {fname}\n") - dirname = os.path.dirname(fname) - os.makedirs(os.path.join(target_path, dirname), exist_ok=True) + for fname in get_source_files(source_path_lang): + if os.path.normpath(fname) in context["main"]["ignore"]: + continue - extension = 
os.path.splitext(fname)[-1] - if extension in (".html", ".md"): - with open(os.path.join(source_path, fname), encoding="utf-8") as f: - content = f.read() - if extension == ".md": - body = markdown.markdown( - content, extensions=context["main"]["markdown_extensions"] + sys.stderr.write(f"Processing {fname}\n") + dirname = os.path.dirname(fname) + os.makedirs(os.path.join(target_path_lang, dirname), exist_ok=True) + + extension = os.path.splitext(fname)[-1] + if extension in (".html", ".md"): + with open(os.path.join(source_path_lang, fname), encoding="utf-8") as f: + content = f.read() + if extension == ".md": + body = markdown.markdown( + content, extensions=context["main"]["markdown_extensions"] + ) + # Apply Bootstrap's table formatting manually + # Python-Markdown doesn't let us config table attributes by hand + body = body.replace( + "", '
' + ) + content = extend_base_template( + body, context["main"]["base_template"] + ) + context["base_url"] = "".join( + ["../"] * os.path.normpath(fname).count("/") + ) + content = jinja_env.from_string(content).render(**context) + fname_html = os.path.splitext(fname)[0] + ".html" + with open( + os.path.join(target_path_lang, fname_html), "w", encoding="utf-8" + ) as f: + f.write(content) + else: + shutil.copy( + os.path.join(source_path_lang, fname), + os.path.join(target_path_lang, dirname), ) - # Apply Bootstrap's table formatting manually - # Python-Markdown doesn't let us config table attributes by hand - body = body.replace("
", '
') - content = extend_base_template(body, context["main"]["base_template"]) - context["base_url"] = "".join(["../"] * os.path.normpath(fname).count("/")) - content = jinja_env.from_string(content).render(**context) - fname_html = os.path.splitext(fname)[0] + ".html" - with open( - os.path.join(target_path, fname_html), "w", encoding="utf-8" - ) as f: - f.write(content) - else: - shutil.copy( - os.path.join(source_path, fname), os.path.join(target_path, dirname) - ) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Documentation builder.") parser.add_argument( - "source_path", help="path to the source directory (must contain config.yml)" + "source_path", + help="path to the source directory (must contain each language folder)", + ) + # For each language, the output will be written in a subdirectory named + # after the language (default: build/en) + parser.add_argument( + "--target-path", default="build", help="directory where to write the output." + ) + # e.g. python pandas_web.py --source_path pandas/content --languages en pt parser.add_argument( - "--target-path", default="build", help="directory where to write the output" + "--languages", + nargs="*", + default=["en"], + help="language codes to build (default: en)", ) args = parser.parse_args() - sys.exit(main(args.source_path, args.target_path)) + sys.exit(main(args.source_path, args.target_path, args.languages))