From a1c6a5fb967297f20f85c711db4f475953d22c6f Mon Sep 17 00:00:00 2001 From: juvizueteva Date: Wed, 26 Mar 2025 01:42:21 +0000 Subject: [PATCH 1/4] primer cambio --- lessons/02_bag_of_words.ipynb | 233 ++++++++++++++++++---------------- 1 file changed, 122 insertions(+), 111 deletions(-) diff --git a/lessons/02_bag_of_words.ipynb b/lessons/02_bag_of_words.ipynb index cbc9046..25868f6 100644 --- a/lessons/02_bag_of_words.ipynb +++ b/lessons/02_bag_of_words.ipynb @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "id": "f3862ffd-918f-4184-8c90-8a39a8a2a069", "metadata": {}, "outputs": [], @@ -104,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 19, "id": "4190e351-97b7-4c5b-866e-07aa6cbd42c2", "metadata": {}, "outputs": [], @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 20, "id": "79acbaf2-6625-4abb-b50f-97ea54ba0d11", "metadata": {}, "outputs": [ @@ -290,7 +290,7 @@ "4 2015-02-24 11:14:45 -0800 NaN Pacific Time (US & Canada) " ] }, - "execution_count": 3, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -316,7 +316,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 21, "id": "a1faaf90-8c01-4d25-9468-90c01823f0d5", "metadata": {}, "outputs": [], @@ -334,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "id": "438830e6-1064-47fe-b578-a1ca693a0ed0", "metadata": {}, "outputs": [ @@ -369,13 +369,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "id": "01955158-6954-447a-acb6-2989d02a49c3", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -404,7 +404,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 24, "id": "428ddde7-af73-4eb6-92c9-041a1791ca59", "metadata": {}, "outputs": [ @@ -417,7 +417,7 @@ "Name: retweet_count, dtype: float64" ] }, - "execution_count": 7, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -439,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 25, "id": "12aa9f2d-d655-494a-bb72-08ad973518f3", "metadata": {}, "outputs": [ @@ -519,7 +519,7 @@ "Virgin America 0.543544 0.456456" ] }, - "execution_count": 8, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -581,19 +581,30 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "id": "21738b02-9ab9-4a61-b41f-ff75888aa747", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspaces/Python-Text-Analysis_grupo_4/lessons/utils.py:4: SyntaxWarning: invalid escape sequence '\\d'\n", + " digit_pattern = '\\d+'\n", + "/workspaces/Python-Text-Analysis_grupo_4/lessons/utils.py:14: SyntaxWarning: invalid escape sequence '\\d'\n", + " digit_pattern = '\\d+'\n" + ] + } + ], "source": [ "from utils import placeholder" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "03569f0d-34ba-492d-aa1d-1dce9d34f792", "metadata": {}, "outputs": [], @@ -618,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 27, "id": "8990cefd-5d04-46ba-ada2-29978c28cfe8", "metadata": {}, "outputs": [ @@ -628,7 +639,7 @@ "text": [ "lol @justinbeiber and @BillGates are like soo 2000 #yesterday #amiright saw it on https://twitter.com #yolo\n", "==================================================\n", - "lol USER and USER are like soo DIGIT HASHTAG HASHTAG saw it on URL HASHTAG\n" + "Ellipsis\n" ] } ], @@ -645,7 +656,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 
28, "id": "a5f7bb6a-f064-48cc-b650-12c4ef2fbb88", "metadata": { "scrolled": true @@ -654,15 +665,15 @@ { "data": { "text/plain": [ - "0 USER plus you've added commercials to the expe...\n", - "1 USER it's really aggressive to blast obnoxious...\n", - "2 USER and it's a really big bad thing about it\n", - "3 USER seriously would pay $ DIGIT a flight for ...\n", - "4 USER yes, nearly every time i fly vx this “ear...\n", + "0 Ellipsis\n", + "1 Ellipsis\n", + "2 Ellipsis\n", + "3 Ellipsis\n", + "4 Ellipsis\n", "Name: text_processed, dtype: object" ] }, - "execution_count": 12, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -687,17 +698,17 @@ "metadata": {}, "source": [ "\n", - "# The Bag-of-Words Representation\n", + "# La Representación Bag-of-Words\n", "\n", - "The idea of bag-of-words (BoW), as the name suggests, is quite intuitive: we take a document and toss it in a bag. The action of \"throwing\" the document in a bag disregards the relative position between words, so what is \"in the bag\" is essentially \"an unsorted set of words\" [(Jurafsky & Martin, 2024)](https://web.stanford.edu/~jurafsky/slp3/ed3book.pdf). In return, we have a list of unique words and the frequency of each of them. \n", + "La idea de bag-of-words (BoW), como sugiere el nombre, es bastante intuitiva: tomamos un documento y lo arrojamos en una bolsa. La acción de \"arrojar\" el documento en una bolsa ignora la posición relativa entre las palabras, por lo que lo que queda \"en la bolsa\" es esencialmente \"un conjunto desordenado de palabras\" [(Jurafsky & Martin, 2024)](https://web.stanford.edu/~jurafsky/slp3/ed3book.pdf). A cambio, obtenemos una lista de palabras únicas y la frecuencia de cada una de ellas. \n", "\n", - "For example, as shown in the following illustration, the word \"coffee\" appears twice. \n", + "Por ejemplo, como se muestra en la siguiente ilustración, la palabra \"coffee\" aparece dos veces. 
\n", "\n", "\"BoW-Part2\"\n", "\n", - "With a bag-of-words representation, we make heavy use of word frequency but not too much of word order. \n", + "Con una representación bag-of-words, hacemos un uso intensivo de la frecuencia de las palabras, pero no tanto del orden en que aparecen. \n", "\n", - "In the context of sentiment analysis, the sentiment of a tweet is conveyed more strongly by specific words. For example, if a tweet contains the word \"happy,\" it likely conveys positive sentiment, but not always (e.g., \"not happy\" denotes the opposite sentiment). When these words come up more often, they'll probably more strongly convey the sentiment." + "En el contexto del análisis de sentimiento, el sentimiento de un tweet se transmite más fuertemente a través de palabras específicas. Por ejemplo, si un tweet contiene la palabra \"happy\", es probable que transmita un sentimiento positivo, aunque no siempre (por ejemplo, \"not happy\" denota el sentimiento opuesto). Cuando estas palabras aparecen con mayor frecuencia, probablemente transmitirán el sentimiento con más fuerza.\n" ] }, { @@ -707,13 +718,13 @@ "source": [ "## Document Term Matrix\n", "\n", - "Now let's implement the idea of bag-of-words. Before we dive deeper, let's step back for a moment. In practice, text analysis often involves handling many documents; from now on, we use the term **document** to represent a piece of text on which we perform analysis. It could be a phrase, a sentence, a tweet, or any other text—as long as it can be represented by a string, the length dosen't really matter. \n", + "Ahora implementemos la idea de bag-of-words. Antes de profundizar, retrocedamos un momento. En la práctica, el análisis de texto a menudo implica manejar múltiples documentos; de ahora en adelante, utilizaremos el término **document** para representar un fragmento de texto sobre el cual realizamos análisis. 
Puede ser una frase, una oración, un tweet o cualquier otro texto—mientras pueda representarse como una cadena de caracteres, su longitud no es realmente un problema. \n", "\n", - "Imagine we have four documents (i.e., the four phrases shown above), and we toss them all in the bag. Instead of a word-frequency list, we'd expect a document-term matrix (DTM) in return. In a DTM, the word list is the **vocabulary** (V) that holds all unique words occur across the documents. For each **document** (D), we count the number of occurence of each word in the vocabulary, and then plug the number into the matrix. In other words, the DTM we will construct is a $D \\times V$ matrix, where each row corresponds to a document, and each column corresponds to a token (or \"term\").\n", + "Imagina que tenemos cuatro documentos (es decir, las cuatro frases mostradas anteriormente) y los arrojamos todos en la bolsa. En lugar de obtener una lista de frecuencias de palabras, obtendremos una document-term matrix (DTM). En una DTM, la lista de palabras constituye el **vocabulary** (V), que contiene todas las palabras únicas que aparecen en los documentos. Para cada **document** (D), contamos la cantidad de veces que aparece cada palabra en el vocabulario y luego colocamos ese número en la matriz. En otras palabras, la DTM que construiremos es una matriz $D \\times V$, donde cada fila corresponde a un documento y cada columna a un token (o \"término\"). \n", "\n", - "The unique tokens in this set of documents, arranged in alphabetical order, form the columns. For each document, we mark the occurence of each word present in the document. The numerical representation for each document is a row in the matrix. For example, the first document, \"the coffee roaster,\" has the numerical representation $[0, 1, 0, 0, 0, 1, 1, 0]$.\n", + "Los tokens únicos en este conjunto de documentos, organizados en orden alfabético, forman las columnas. 
Para cada documento, marcamos la frecuencia de cada palabra presente en el documento. La representación numérica de cada documento es una fila en la matriz. Por ejemplo, el primer documento, \"the coffee roaster\", tiene la representación numérica $[0, 1, 0, 0, 0, 1, 1, 0]$. \n", "\n", - "Note that the left index column now displays these documents as text, but typically we would just assign an index to each of them. \n", + "Nota que la columna de índices a la izquierda muestra estos documentos como texto, pero típicamente solo se les asignaría un número de índice. \n", "\n", "$$\n", "\\begin{array}{c|cccccccccccc}\n", @@ -725,12 +736,12 @@ "\\end{array}\n", "$$\n", "\n", - "To create a DTM, we will use `CountVectorizer` from the package `sklearn`." + "Para crear una DTM, utilizaremos `CountVectorizer` del paquete `sklearn`.\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 1, "id": "cd2adf56-ba93-459d-8cfa-16ce8dc9284b", "metadata": {}, "outputs": [], @@ -743,11 +754,11 @@ "id": "4989781d-6b40-417a-be70-eeba05cd8a50", "metadata": {}, "source": [ - "The following illustration depicts the three-step workflow of creating a DTM with `CountVectorizr`.\n", + "La siguiente ilustración muestra el flujo de trabajo en tres pasos para crear una DTM con `CountVectorizer`.\n", "\n", "\"CountVectorizer\"\n", "\n", - "Let's walk through these steps with the toy example shown above." + "Repasemos estos pasos utilizando el ejemplo simple mostrado anteriormente." ] }, { @@ -755,12 +766,12 @@ "id": "34174034-46b9-43e2-a511-5972d378cb00", "metadata": {}, "source": [ - "### A Toy Example" + "### Un Ejemplo Sencillo\n" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 2, "id": "4da2bd3d-0460-4b5f-9b9e-02940db0d7ca", "metadata": {}, "outputs": [], @@ -777,14 +788,14 @@ "id": "dff7c1d3-fcee-4e20-b9a7-17306ebd5fc2", "metadata": {}, "source": [ - "The first step is to initialize a `CountVectorizer` object. 
Within the round paratheses, we can specify parameter settings if desired. Let's take a look at the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) and see what options are available. \n", + "El primer paso es inicializar un objeto `CountVectorizer`. Dentro de los paréntesis, podemos especificar parámetros de configuración si lo deseamos. Echemos un vistazo a la [documentación](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) para ver qué opciones están disponibles. \n", "\n", - "For now we can just leave it blank to use the default settings. " + "Por ahora, podemos dejarlo en blanco para usar la configuración predeterminada. " ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "id": "9de3fe6a-9abf-4e11-aad1-e54c891567bb", "metadata": {}, "outputs": [], @@ -798,14 +809,14 @@ "id": "1b5a7d0d-0bfc-4fb9-8e5f-e91e39797fb5", "metadata": {}, "source": [ - "The second step is to `fit` this `CountVectorizer` object to the data, which means creating a vocabulary of tokens from the set of documents. Thirdly, we `transform` our data according to the \"fitted\" `CountVectorizer` object, which means taking each of the document and counting the occurrences of tokens according to the vocabulary established during the \"fitting\" step.\n", + "El segundo paso es aplicar `fit` al objeto `CountVectorizer` con los datos, lo que significa crear un vocabulario de tokens a partir del conjunto de documentos. Luego, en el tercer paso, usamos `transform` para procesar nuestros datos de acuerdo con el objeto `CountVectorizer` \"ajustado\". Esto implica tomar cada documento y contar la aparición de tokens según el vocabulario establecido durante el paso de \"ajuste\". \n", "\n", - "It may sound a bit complex but steps 2 and 3 can be done in one swoop using a `fit_transform` function." 
+ "Puede sonar un poco complejo, pero los pasos 2 y 3 pueden realizarse en una sola operación utilizando la función `fit_transform`. " ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "id": "da1bbad4-bb1a-4b92-9096-6e17558b4a42", "metadata": {}, "outputs": [], @@ -819,25 +830,25 @@ "id": "324d3b65-4e98-48bf-87d2-399457f4939c", "metadata": {}, "source": [ - "The return of `fit_transform` is supposed to be the DTM. \n", + "El resultado de `fit_transform` debería ser la DTM. \n", "\n", - "Let's take a look at it!" + "¡Echemos un vistazo! " ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "id": "cb044001-8eb2-4489-b025-2d8e2d4bfee2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "<4x8 sparse matrix of type ''\n", - "\twith 9 stored elements in Compressed Sparse Row format>" + "" ] }, - "execution_count": 17, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -851,14 +862,14 @@ "id": "f9817b09-a806-42c4-9436-822cc27a38b9", "metadata": {}, "source": [ - "Apparently we've got a \"sparse matrix\"—a matrix that contains a lot of zeros. This makes sense. For each document, there are words that don't occur at all, and these are counted as zero in the DTM. This sparse matrix is stored in a \"Compressed Sparse Row\" format, a memory-saving format designed for handling sparse matrices. \n", + "Aparentemente, hemos obtenido una \"sparse matrix\", es decir, una matriz que contiene muchos ceros. Esto tiene sentido: en cada documento, hay palabras que no aparecen en absoluto, y estas se registran como ceros en la DTM. Esta matriz dispersa se almacena en un formato \"Compressed Sparse Row\", un formato optimizado para ahorrar memoria al manejar matrices dispersas. \n", "\n", - "Let's convert it to a dense matrix, where those zeros are probably represented, as in a numpy array." 
+ "Convirtámosla en una matriz densa, donde esos ceros se representan explícitamente, como en un array de numpy. " ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 6, "id": "bb03a238-87d8-40c9-b20e-66e7c9b6576b", "metadata": {}, "outputs": [ @@ -871,7 +882,7 @@ " [0, 1, 0, 0, 0, 0, 0, 1]])" ] }, - "execution_count": 18, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -886,12 +897,12 @@ "id": "28b58a63-d7f6-4b9f-aadf-4d4fc7341336", "metadata": {}, "source": [ - "So this is our DTM! The matrix is the same as shown above. To make it more reader-friendly, let's convert it to a dataframe. The column names should be tokens in the vocabulary, which we can access with the `get_feature_names_out` function." + "¡Así que esta es nuestra DTM! La matriz es la misma que mostramos anteriormente. Para hacerla más fácil de leer, convirtámosla en un dataframe. Los nombres de las columnas deben ser los tokens del vocabulario, a los cuales podemos acceder con la función `get_feature_names_out`. " ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 7, "id": "714de5d3-e37d-4a19-9ade-3c6629e38d4e", "metadata": {}, "outputs": [ @@ -902,7 +913,7 @@ " 'time'], dtype=object)" ] }, - "execution_count": 19, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -914,7 +925,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 10, "id": "6a7729a2-ca2e-4de7-8795-74dfedb7a4d5", "metadata": {}, "outputs": [], @@ -929,12 +940,12 @@ "id": "781da407-f394-40f2-9d45-1fac39f02047", "metadata": {}, "source": [ - "Here it is! The DTM of our toy data is now a dataframe. The index of `test_dtm` corresponds to the position of each document in the `test` list. " + "¡Aquí está! La DTM de nuestros datos de ejemplo ahora es un dataframe. El índice de `test_dtm` corresponde a la posición de cada documento en la lista `test`. 
" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 11, "id": "e41dd243-cd2e-43c3-80f8-5eaab6e64210", "metadata": {}, "outputs": [ @@ -1026,7 +1037,7 @@ "3 0 1 0 0 0 0 0 1" ] }, - "execution_count": 21, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1040,20 +1051,20 @@ "id": "d59a03b4-94fa-4fe7-8f5d-7280e31b9bc4", "metadata": {}, "source": [ - "Hopefully this toy example provides a clear walkthrough of creating a DTM.\n", + "Esperamos que este ejemplo sencillo haya proporcionado una guía clara para crear una DTM.\n", "\n", - "Now it's time for our tweets data!\n", + "¡Ahora es el momento de trabajar con nuestros datos de tweets!\n", "\n", - "### DTM for Tweets\n", + "### DTM para Tweets\n", "\n", - "We'll begin by initializing a `CountVectorizer` object. In the following cell, we have included a few parameters that people often adjust. These parameters are currently set to their default values.\n", + "Comenzaremos inicializando un objeto `CountVectorizer`. En la siguiente celda, hemos incluido algunos parámetros que las personas ajustan con frecuencia. Estos parámetros están configurados actualmente con sus valores predeterminados.\n", "\n", - "When we construct a DTM, the default is to lowercase the input text. If nothing is provided for `stop_words`, the default is to keep them. The next three parameters are used to control the size of the vocabulary, which we'll return to in a minute." + "Cuando construimos una DTM, el valor predeterminado es convertir a minúsculas el texto de entrada. Si no se proporciona nada para `stop_words`, el valor predeterminado es mantenerlas. Los siguientes tres parámetros se usan para controlar el tamaño del vocabulario, sobre lo cual volveremos en un momento." 
] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 12, "id": "783e44a4-4a22-4290-b222-282b02c080dc", "metadata": {}, "outputs": [], @@ -1068,22 +1079,27 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 29, "id": "f85e76ea-bc54-4775-bcda-432a03d2c96f", "metadata": { "scrolled": true }, "outputs": [ { - "data": { - "text/plain": [ - "<11541x8751 sparse matrix of type ''\n", - "\twith 191139 stored elements in Compressed Sparse Row format>" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" + "ename": "AttributeError", + "evalue": "'ellipsis' object has no attribute 'lower'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[29]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Fit and transform to create DTM\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m counts = \u001b[43mvectorizer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtweets\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mtext_processed\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3\u001b[39m counts\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/base.py:1389\u001b[39m, in \u001b[36m_fit_context..decorator..wrapper\u001b[39m\u001b[34m(estimator, *args, **kwargs)\u001b[39m\n\u001b[32m 1382\u001b[39m estimator._validate_params()\n\u001b[32m 1384\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[32m 1385\u001b[39m skip_parameter_validation=(\n\u001b[32m 1386\u001b[39m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[32m 1387\u001b[39m 
)\n\u001b[32m 1388\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1389\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/feature_extraction/text.py:1376\u001b[39m, in \u001b[36mCountVectorizer.fit_transform\u001b[39m\u001b[34m(self, raw_documents, y)\u001b[39m\n\u001b[32m 1368\u001b[39m warnings.warn(\n\u001b[32m 1369\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mUpper case characters found in\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1370\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m vocabulary while \u001b[39m\u001b[33m'\u001b[39m\u001b[33mlowercase\u001b[39m\u001b[33m'\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1371\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m is True. 
These entries will not\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1372\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m be matched with any documents\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1373\u001b[39m )\n\u001b[32m 1374\u001b[39m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1376\u001b[39m vocabulary, X = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_count_vocab\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfixed_vocabulary_\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1378\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.binary:\n\u001b[32m 1379\u001b[39m X.data.fill(\u001b[32m1\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/feature_extraction/text.py:1263\u001b[39m, in \u001b[36mCountVectorizer._count_vocab\u001b[39m\u001b[34m(self, raw_documents, fixed_vocab)\u001b[39m\n\u001b[32m 1261\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m raw_documents:\n\u001b[32m 1262\u001b[39m feature_counter = {}\n\u001b[32m-> \u001b[39m\u001b[32m1263\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m feature \u001b[38;5;129;01min\u001b[39;00m \u001b[43manalyze\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[32m 1264\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 1265\u001b[39m feature_idx = vocabulary[feature]\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/feature_extraction/text.py:104\u001b[39m, in \u001b[36m_analyze\u001b[39m\u001b[34m(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)\u001b[39m\n\u001b[32m 102\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 103\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m preprocessor \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m104\u001b[39m doc = \u001b[43mpreprocessor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 105\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m tokenizer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 106\u001b[39m doc = tokenizer(doc)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/feature_extraction/text.py:62\u001b[39m, in \u001b[36m_preprocess\u001b[39m\u001b[34m(doc, accent_function, lower)\u001b[39m\n\u001b[32m 43\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Chain together an optional series of text preprocessing steps to\u001b[39;00m\n\u001b[32m 44\u001b[39m \u001b[33;03mapply to a document.\u001b[39;00m\n\u001b[32m 45\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 59\u001b[39m \u001b[33;03m preprocessed string\u001b[39;00m\n\u001b[32m 60\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 61\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m lower:\n\u001b[32m---> \u001b[39m\u001b[32m62\u001b[39m doc = \u001b[43mdoc\u001b[49m\u001b[43m.\u001b[49m\u001b[43mlower\u001b[49m()\n\u001b[32m 63\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m accent_function \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 64\u001b[39m doc = accent_function(doc)\n", + "\u001b[31mAttributeError\u001b[39m: 'ellipsis' object has no attribute 'lower'" + ] } ], "source": [ @@ -1094,25 +1110,20 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 30, "id": "87119057-c78c-4eb2-a9d6-3e9f44e4c22b", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([[0, 0, 0, ..., 0, 0, 0],\n", - " [0, 0, 0, ..., 0, 0, 0],\n", - " [0, 0, 0, ..., 0, 0, 0],\n", - " ...,\n", - " [0, 0, 0, ..., 0, 0, 0],\n", - " [0, 0, 0, ..., 0, 0, 0],\n", - " [0, 0, 0, ..., 0, 0, 0]])" - ] - }, - 
"execution_count": 24, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'counts' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[30]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Do not run if you have limited memory - this includes DataHub and Binder\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m np.array(\u001b[43mcounts\u001b[49m.todense())\n", + "\u001b[31mNameError\u001b[39m: name 'counts' is not defined" + ] } ], "source": [ @@ -1160,7 +1171,7 @@ "id": "2dd257d5-4244-436c-afe7-5688232caf8f", "metadata": {}, "source": [ - "If we leave the `CountVectorizer` to the default setting, the vocabulary size of the tweet data is 8751. " + "Si dejamos el `CountVectorizer` con la configuración predeterminada, el tamaño del vocabulario de los datos de los tweets es 8751. " ] }, { @@ -1371,9 +1382,9 @@ "id": "095d34e2-52f8-4419-b4c7-ed20dbd5df89", "metadata": {}, "source": [ - "Most of the tokens have zero occurences at least in the first five tweets. \n", + "La mayoría de los tokens tienen cero ocurrencias, al menos en los primeros cinco tweets. \n", "\n", - "Let's take a closer look at the DTM!" + "¡Echemos un vistazo más de cerca a la DTM! " ] }, { @@ -1445,9 +1456,9 @@ "id": "5d230f79-e752-4e32-93db-4f013287f8e2", "metadata": {}, "source": [ - "It is not surprising to see \"user\" and \"digit\" to be among the most frequent tokens as we replaced each idiosyncratic one with these placeholders. The rest of the most frequent tokens are mostly stop words.\n", + "No es sorprendente ver que \"user\" y \"digit\" estén entre los tokens más frecuentes, ya que reemplazamos cada uno de los idiosincráticos con estos marcadores de posición. 
El resto de los tokens más frecuentes son principalmente palabras vacías (stop words).\n", "\n", - "Perhaps a more interesting pattern is to look for which token appears most in any given tweet:" + "Tal vez un patrón más interesante sea buscar qué token aparece más en cualquier tweet dado:" ] }, { @@ -1575,9 +1586,9 @@ "id": "7cdac4ef-6b9d-4aad-9b24-c70f6c2eb8f0", "metadata": {}, "source": [ - "It looks like among all tweets, at most a token appears six times, and it is either the word \"It\" or the word \"worst.\" \n", + "Parece que, entre todos los tweets, un token aparece como máximo seis veces, y se trata de la palabra \"It\" o de la palabra \"worst\".\n", "\n", - "Let's go back to our tweets dataframe and locate the 918th tweet." + "Volvamos a nuestro dataframe de tweets y ubiquemos el tweet número 918." ] }, { @@ -1607,17 +1618,17 @@ "id": "3dba8e37-4880-4565-b6fc-7e7c96958f0f", "metadata": {}, "source": [ - "## Customize the `CountVectorizer`\n", + "## Personalizar el `CountVectorizer`\n", "\n", - "So far we've always used the default parameter setting to create our DTMs, but in many cases we may want to customize the `CountVectorizer` object. The purpose of doing so is to further filter out unnecessary tokens. In the example below, we tweak the following parameters:\n", + "Hasta ahora, siempre hemos utilizado la configuración predeterminada de parámetros para crear nuestras DTMs, pero en muchos casos, es posible que queramos personalizar el objeto `CountVectorizer`. El propósito de hacerlo es filtrar más a fondo los tokens innecesarios. 
En el ejemplo siguiente, ajustamos los siguientes parámetros:\n", "\n", - "- `stop_words = 'english'`: ignore English stop words \n", - "- `min_df = 2`: ignore words that don't occur at least twice\n", - "- `max_df = 0.95`: ignore words if they appear in more than 95\\% of the documents\n", + "- `stop_words = 'english'`: ignorar las palabras vacías en inglés\n", + "- `min_df = 2`: ignorar palabras que no aparecen en al menos dos documentos\n", + "- `max_df = 0.95`: ignorar palabras que aparecen en más del 95\\% de los documentos\n", "\n", - "🔔 **Question**: Let's pause for a minute to discuss whether it sounds reasonable to set these parameters! What do you think?\n", + "🔔 **Pregunta**: ¡Paremos un minuto para discutir si tiene sentido establecer estos parámetros! ¿Qué opinas?\n", "\n", - "Oftentimes, we are not interested in words whose frequencies are either too low or too high, so we use `min_df` and `max_df` to filter them out. Alternatively, we can define our vocabulary size as $N$ by setting `max_features`. In other words, we tell `CountVectorizer` to only consider the top $N$ most frequent tokens when constructing the DTM." + "A menudo, no estamos interesados en palabras cuya frecuencia es demasiado baja o demasiado alta, por lo que usamos `min_df` y `max_df` para filtrarlas. Alternativamente, podemos definir el tamaño de nuestro vocabulario como $N$ configurando `max_features`. En otras palabras, le decimos a `CountVectorizer` que solo considere los $N$ tokens más frecuentes al construir la DTM." ] }, { @@ -1657,7 +1668,7 @@ "id": "6d2e66bc-2eaa-4642-8848-74459948084b", "metadata": {}, "source": [ - "Our second DTM has a substantially smaller vocabulary compared to the first one." + "Nuestra segunda DTM tiene un vocabulario considerablemente más pequeño en comparación con la primera." 
] }, { @@ -1888,7 +1899,7 @@ "id": "998fe2c3-ec90-4027-8c7f-417327a33a27", "metadata": {}, "source": [ - "The most frequent token list now includes words that make more sense to us, such as \"cancelled\" and \"service.\" " + "La lista de tokens más frecuentes ahora incluye palabras que tienen más sentido para nosotros, como \"cancelled\" y \"service.\"" ] }, { @@ -3350,7 +3361,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -3364,7 +3375,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.1" } }, "nbformat": 4, From 04792fa6e27fb5e48caa749d45e2c2eefd732200 Mon Sep 17 00:00:00 2001 From: juvizueteva Date: Wed, 26 Mar 2025 03:02:20 +0000 Subject: [PATCH 2/4] challenges realizados --- lessons/02_bag_of_words.ipynb | 442 ++++++++++++++++++---------------- 1 file changed, 235 insertions(+), 207 deletions(-) diff --git a/lessons/02_bag_of_words.ipynb b/lessons/02_bag_of_words.ipynb index 25868f6..67ebae3 100644 --- a/lessons/02_bag_of_words.ipynb +++ b/lessons/02_bag_of_words.ipynb @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "id": "9e4a3a0d-66f4-44e5-8dd6-5f441146014d", "metadata": { "scrolled": true, @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "id": "21ed437f-9767-43b7-abc5-159aa4339a31", "metadata": {}, "outputs": [], @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 70, "id": "f3862ffd-918f-4184-8c90-8a39a8a2a069", "metadata": {}, "outputs": [], @@ -104,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 71, "id": "4190e351-97b7-4c5b-866e-07aa6cbd42c2", "metadata": {}, "outputs": [], @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 72, "id": 
"79acbaf2-6625-4abb-b50f-97ea54ba0d11", "metadata": {}, "outputs": [ @@ -290,7 +290,7 @@ "4 2015-02-24 11:14:45 -0800 NaN Pacific Time (US & Canada) " ] }, - "execution_count": 20, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } @@ -316,7 +316,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 73, "id": "a1faaf90-8c01-4d25-9468-90c01823f0d5", "metadata": {}, "outputs": [], @@ -334,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 74, "id": "438830e6-1064-47fe-b578-a1ca693a0ed0", "metadata": {}, "outputs": [ @@ -369,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 75, "id": "01955158-6954-447a-acb6-2989d02a49c3", "metadata": {}, "outputs": [ @@ -404,7 +404,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 76, "id": "428ddde7-af73-4eb6-92c9-041a1791ca59", "metadata": {}, "outputs": [ @@ -417,7 +417,7 @@ "Name: retweet_count, dtype: float64" ] }, - "execution_count": 24, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } @@ -439,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 77, "id": "12aa9f2d-d655-494a-bb72-08ad973518f3", "metadata": {}, "outputs": [ @@ -519,7 +519,7 @@ "Virgin America 0.543544 0.456456" ] }, - "execution_count": 25, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } @@ -581,30 +581,20 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 78, "id": "21738b02-9ab9-4a61-b41f-ff75888aa747", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspaces/Python-Text-Analysis_grupo_4/lessons/utils.py:4: SyntaxWarning: invalid escape sequence '\\d'\n", - " digit_pattern = '\\d+'\n", - "/workspaces/Python-Text-Analysis_grupo_4/lessons/utils.py:14: SyntaxWarning: invalid escape sequence '\\d'\n", - " digit_pattern = '\\d+'\n" - ] - } - ], + "outputs": 
[], "source": [ - "from utils import placeholder" + "from utils import placeholder\n", + "import re" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 79, "id": "03569f0d-34ba-492d-aa1d-1dce9d34f792", "metadata": {}, "outputs": [], @@ -615,21 +605,21 @@ "def preprocess(text):\n", " '''Create a preprocess pipeline that cleans the tweet data.'''\n", " \n", - " # Step 1: Lowercase\n", - " text = ...\n", - "\n", - " # Step 2: Replace patterns with placeholders\n", - " text = ...\n", - "\n", + " # Step 1: Convert text to lowercase\n", + " text = text.lower()\n", + " \n", + " # Step 2: Replace patterns with placeholders (URLs, digits, hashtags, user handles)\n", + " text = placeholder(text)\n", + " \n", " # Step 3: Remove extra whitespace characters\n", - " text = ...\n", - "\n", + " text = re.sub(blankspace_pattern, blankspace_repl, text)\n", + " \n", " return text" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 80, "id": "8990cefd-5d04-46ba-ada2-29978c28cfe8", "metadata": {}, "outputs": [ @@ -639,7 +629,7 @@ "text": [ "lol @justinbeiber and @BillGates are like soo 2000 #yesterday #amiright saw it on https://twitter.com #yolo\n", "==================================================\n", - "Ellipsis\n" + "lol USER and USER are like soo DIGIT HASHTAG HASHTAG saw it on URL HASHTAG \n" ] } ], @@ -656,7 +646,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 81, "id": "a5f7bb6a-f064-48cc-b650-12c4ef2fbb88", "metadata": { "scrolled": true @@ -665,15 +655,15 @@ { "data": { "text/plain": [ - "0 Ellipsis\n", - "1 Ellipsis\n", - "2 Ellipsis\n", - "3 Ellipsis\n", - "4 Ellipsis\n", + "0 USER plus you've added commercials to the exp...\n", + "1 USER it's really aggressive to blast obnoxiou...\n", + "2 USER and it's a really big bad thing about it\n", + "3 USER seriously would pay $ DIGIT a flight for...\n", + "4 USER yes, nearly every time i fly vx this “ea...\n", "Name: text_processed, dtype: object" ] }, - 
"execution_count": 28, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -741,7 +731,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 82, "id": "cd2adf56-ba93-459d-8cfa-16ce8dc9284b", "metadata": {}, "outputs": [], @@ -771,7 +761,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 83, "id": "4da2bd3d-0460-4b5f-9b9e-02940db0d7ca", "metadata": {}, "outputs": [], @@ -795,7 +785,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 84, "id": "9de3fe6a-9abf-4e11-aad1-e54c891567bb", "metadata": {}, "outputs": [], @@ -816,7 +806,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 85, "id": "da1bbad4-bb1a-4b92-9096-6e17558b4a42", "metadata": {}, "outputs": [], @@ -837,7 +827,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 86, "id": "cb044001-8eb2-4489-b025-2d8e2d4bfee2", "metadata": {}, "outputs": [ @@ -848,7 +838,7 @@ "\twith 9 stored elements and shape (4, 8)>" ] }, - "execution_count": 5, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } @@ -869,7 +859,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 87, "id": "bb03a238-87d8-40c9-b20e-66e7c9b6576b", "metadata": {}, "outputs": [ @@ -882,7 +872,7 @@ " [0, 1, 0, 0, 0, 0, 0, 1]])" ] }, - "execution_count": 6, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } @@ -902,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 88, "id": "714de5d3-e37d-4a19-9ade-3c6629e38d4e", "metadata": {}, "outputs": [ @@ -913,7 +903,7 @@ " 'time'], dtype=object)" ] }, - "execution_count": 7, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } @@ -925,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 89, "id": "6a7729a2-ca2e-4de7-8795-74dfedb7a4d5", "metadata": {}, "outputs": [], @@ -945,7 +935,7 @@ }, { "cell_type": "code", - 
"execution_count": 11, + "execution_count": 90, "id": "e41dd243-cd2e-43c3-80f8-5eaab6e64210", "metadata": {}, "outputs": [ @@ -1037,7 +1027,7 @@ "3 0 1 0 0 0 0 0 1" ] }, - "execution_count": 11, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -1064,7 +1054,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 91, "id": "783e44a4-4a22-4290-b222-282b02c080dc", "metadata": {}, "outputs": [], @@ -1079,27 +1069,22 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 92, "id": "f85e76ea-bc54-4775-bcda-432a03d2c96f", "metadata": { "scrolled": true }, "outputs": [ { - "ename": "AttributeError", - "evalue": "'ellipsis' object has no attribute 'lower'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[29]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Fit and transform to create DTM\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m counts = \u001b[43mvectorizer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtweets\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mtext_processed\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3\u001b[39m counts\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/base.py:1389\u001b[39m, in \u001b[36m_fit_context..decorator..wrapper\u001b[39m\u001b[34m(estimator, *args, **kwargs)\u001b[39m\n\u001b[32m 1382\u001b[39m estimator._validate_params()\n\u001b[32m 1384\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[32m 1385\u001b[39m skip_parameter_validation=(\n\u001b[32m 1386\u001b[39m prefer_skip_nested_validation 
\u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[32m 1387\u001b[39m )\n\u001b[32m 1388\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1389\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/feature_extraction/text.py:1376\u001b[39m, in \u001b[36mCountVectorizer.fit_transform\u001b[39m\u001b[34m(self, raw_documents, y)\u001b[39m\n\u001b[32m 1368\u001b[39m warnings.warn(\n\u001b[32m 1369\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mUpper case characters found in\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1370\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m vocabulary while \u001b[39m\u001b[33m'\u001b[39m\u001b[33mlowercase\u001b[39m\u001b[33m'\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1371\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m is True. 
These entries will not\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1372\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m be matched with any documents\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1373\u001b[39m )\n\u001b[32m 1374\u001b[39m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1376\u001b[39m vocabulary, X = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_count_vocab\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfixed_vocabulary_\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1378\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.binary:\n\u001b[32m 1379\u001b[39m X.data.fill(\u001b[32m1\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/feature_extraction/text.py:1263\u001b[39m, in \u001b[36mCountVectorizer._count_vocab\u001b[39m\u001b[34m(self, raw_documents, fixed_vocab)\u001b[39m\n\u001b[32m 1261\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m raw_documents:\n\u001b[32m 1262\u001b[39m feature_counter = {}\n\u001b[32m-> \u001b[39m\u001b[32m1263\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m feature \u001b[38;5;129;01min\u001b[39;00m \u001b[43manalyze\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[32m 1264\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 1265\u001b[39m feature_idx = vocabulary[feature]\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/feature_extraction/text.py:104\u001b[39m, in \u001b[36m_analyze\u001b[39m\u001b[34m(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)\u001b[39m\n\u001b[32m 102\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 103\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m preprocessor \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m104\u001b[39m doc = \u001b[43mpreprocessor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 105\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m tokenizer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 106\u001b[39m doc = tokenizer(doc)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/sklearn/feature_extraction/text.py:62\u001b[39m, in \u001b[36m_preprocess\u001b[39m\u001b[34m(doc, accent_function, lower)\u001b[39m\n\u001b[32m 43\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Chain together an optional series of text preprocessing steps to\u001b[39;00m\n\u001b[32m 44\u001b[39m \u001b[33;03mapply to a document.\u001b[39;00m\n\u001b[32m 45\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 59\u001b[39m \u001b[33;03m preprocessed string\u001b[39;00m\n\u001b[32m 60\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 61\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m lower:\n\u001b[32m---> \u001b[39m\u001b[32m62\u001b[39m doc = \u001b[43mdoc\u001b[49m\u001b[43m.\u001b[49m\u001b[43mlower\u001b[49m()\n\u001b[32m 63\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m accent_function \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 64\u001b[39m doc = accent_function(doc)\n", - "\u001b[31mAttributeError\u001b[39m: 'ellipsis' object has no attribute 'lower'" - ] + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1110,20 +1095,25 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 93, "id": "87119057-c78c-4eb2-a9d6-3e9f44e4c22b", "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'counts' is not defined", - "output_type": "error", - "traceback": [ - 
"\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[30]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Do not run if you have limited memory - this includes DataHub and Binder\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m np.array(\u001b[43mcounts\u001b[49m.todense())\n", - "\u001b[31mNameError\u001b[39m: name 'counts' is not defined" - ] + "data": { + "text/plain": [ + "array([[0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " ...,\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0]], shape=(11541, 8751))" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1133,7 +1123,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 94, "id": "99322b85-1a15-46a5-bb80-bb5eaa6eeb7b", "metadata": {}, "outputs": [], @@ -1144,7 +1134,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 95, "id": "43620587-3795-4434-8f1f-145c81b93706", "metadata": {}, "outputs": [ @@ -1176,7 +1166,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 96, "id": "bb3604ec-d909-4238-9a3f-67e7d4ae2ac5", "metadata": {}, "outputs": [ @@ -1368,7 +1358,7 @@ "[5 rows x 8751 columns]" ] }, - "execution_count": 27, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -1389,7 +1379,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 97, "id": "f432154a-eae0-4723-a797-55f3cfdd71c4", "metadata": {}, "outputs": [ @@ -1409,7 +1399,7 @@ "dtype: int64" ] }, - "execution_count": 28, + "execution_count": 97, "metadata": {}, "output_type": "execute_result" } @@ -1421,27 +1411,27 @@ }, { "cell_type": "code", - "execution_count": 
29, + "execution_count": 98, "id": "26c7f1c9-dd66-49f2-b337-01253da551d2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "_exact_ 1\n", - "mightmismybrosgraduation 1\n", - "midterm 1\n", - "midnite 1\n", - "midland 1\n", - "michelle 1\n", - "michele 1\n", - "michael 1\n", - "mhtt 1\n", - "mgmt 1\n", + "zones 1\n", + "accelerate 1\n", + "acc 1\n", + "acarl 1\n", + "yogurt 1\n", + "yoga 1\n", + "yikes 1\n", + "absurdity 1\n", + "absorber 1\n", + "absorb 1\n", "dtype: int64" ] }, - "execution_count": 29, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } @@ -1463,7 +1453,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 99, "id": "efb8f4d8-4c88-4155-a6c5-c72a5b4e8bb8", "metadata": {}, "outputs": [ @@ -1504,42 +1494,42 @@ " 6\n", " \n", " \n", - " 10572\n", + " 11007\n", " to\n", " 5\n", " \n", " \n", - " 8148\n", - " the\n", + " 5513\n", + " to\n", " 5\n", " \n", " \n", - " 10742\n", + " 7750\n", " to\n", " 5\n", " \n", " \n", - " 152\n", - " to\n", + " 10923\n", + " the\n", " 5\n", " \n", " \n", - " 5005\n", + " 4089\n", " to\n", " 5\n", " \n", " \n", - " 10923\n", - " the\n", + " 8134\n", + " to\n", " 5\n", " \n", " \n", - " 7750\n", - " to\n", + " 8148\n", + " the\n", " 5\n", " \n", " \n", - " 355\n", + " 557\n", " to\n", " 5\n", " \n", @@ -1551,17 +1541,17 @@ " token number\n", "3127 lt 6\n", "918 worst 6\n", - "10572 to 5\n", - "8148 the 5\n", - "10742 to 5\n", - "152 to 5\n", - "5005 to 5\n", - "10923 the 5\n", + "11007 to 5\n", + "5513 to 5\n", "7750 to 5\n", - "355 to 5" + "10923 the 5\n", + "4089 to 5\n", + "8134 to 5\n", + "8148 the 5\n", + "557 to 5" ] }, - "execution_count": 30, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -1593,7 +1583,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 100, "id": "5e7cacd8-1fb3-4f0d-a744-4ee0994a089f", "metadata": {}, "outputs": [ @@ -1603,7 +1593,7 @@ "\"@united is the worst. 
Worst reservation policies. Worst costumer service. Worst worst worst. Congrats, @Delta you're not that bad!\"" ] }, - "execution_count": 31, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } @@ -1633,7 +1623,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 101, "id": "37a0a93e-9dd8-43dc-a82c-06a24bf02bc9", "metadata": {}, "outputs": [], @@ -1648,7 +1638,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 102, "id": "b53e5ecf-7be3-4915-9d11-fd3edb913400", "metadata": {}, "outputs": [], @@ -1673,7 +1663,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 103, "id": "570fb598-fa81-4111-9e36-7172d8034713", "metadata": {}, "outputs": [ @@ -1693,7 +1683,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 104, "id": "d8deabb2-20eb-4047-b592-48cb1564fd2a", "metadata": {}, "outputs": [ @@ -1885,7 +1875,7 @@ "[5 rows x 4471 columns]" ] }, - "execution_count": 35, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } @@ -1904,7 +1894,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 105, "id": "ffa7bf4e-640b-49bc-b64b-721140f67f76", "metadata": {}, "outputs": [ @@ -1924,7 +1914,7 @@ "dtype: int64" ] }, - "execution_count": 36, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } @@ -1956,19 +1946,23 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 106, "id": "da610560-62c3-48ab-a1b2-25e0b589bc61", "metadata": {}, "outputs": [], "source": [ "# Import spaCy\n", "import spacy\n", + "import pandas as pd\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Cargar el modelo de spaCy\n", "nlp = spacy.load('en_core_web_sm')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 107, "id": "98ead266-30f3-48ad-bc51-c1685487f000", "metadata": { "scrolled": true @@ -1978,17 +1972,17 @@ "# Create a function to 
lemmatize text\n", "def lemmatize_text(text):\n", " '''Lemmatize the text input with spaCy annotations.'''\n", - "\n", + " \n", " # Step 1: Initialize an empty list to hold lemmas\n", - " lemma = ...\n", - "\n", + " lemma = []\n", + " \n", " # Step 2: Apply the nlp pipeline to input text\n", - " doc = ...\n", - "\n", + " doc = nlp(text)\n", + " \n", " # Step 3: Iterate over tokens in the text to get the token lemma\n", " for token in doc:\n", - " lemma.append(...)\n", - "\n", + " lemma.append(token.lemma_)\n", + " \n", " # Step 4: Join lemmas together into a single string\n", " text_lemma = ' '.join(lemma)\n", " \n", @@ -2005,7 +1999,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 108, "id": "742e82bb-5c42-4fa8-9101-5a0ea908db25", "metadata": {}, "outputs": [ @@ -2013,9 +2007,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "USER wow this just blew my mind\n", + " USER wow this just blew my mind\n", "==================================================\n", - "USER wow this just blow my mind\n" + " USER wow this just blow my mind\n" ] } ], @@ -2036,7 +2030,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 109, "id": "1ac128d2-1be5-4ef5-bb50-5b8d44ef8ee9", "metadata": {}, "outputs": [], @@ -2055,7 +2049,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 110, "id": "5f49d790-3c9d-4dc1-a5c9-72c306630412", "metadata": {}, "outputs": [ @@ -2226,7 +2220,7 @@ " \n", " \n", "\n", - "

5 rows × 3553 columns

\n", + "

5 rows × 3571 columns

\n", "" ], "text/plain": [ @@ -2244,10 +2238,10 @@ "3 0 0 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 0 0 \n", "\n", - "[5 rows x 3553 columns]" + "[5 rows x 3571 columns]" ] }, - "execution_count": 41, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } @@ -2273,7 +2267,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 111, "id": "9859eb04-dbd2-4fa0-9798-65ed7496c297", "metadata": {}, "outputs": [ @@ -2283,7 +2277,7 @@ "text": [ "(11541, 8751)\n", "(11541, 4471)\n", - "(11541, 3553)\n" + "(11541, 3571)\n" ] } ], @@ -2304,7 +2298,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 112, "id": "5745ca29-97ed-4fe1-81db-7e402c8da674", "metadata": {}, "outputs": [ @@ -2312,19 +2306,19 @@ "data": { "text/plain": [ "digit 6927\n", - "flight 4043\n", + "flight 3952\n", "hashtag 2633\n", - "thank 1455\n", + "thank 1454\n", "hour 1134\n", - "cancel 948\n", - "delay 937\n", - "service 937\n", + "cancel 951\n", + "service 939\n", + "delay 934\n", "customer 902\n", - "time 856\n", + "time 860\n", "dtype: int64" ] }, - "execution_count": 43, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -2336,7 +2330,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 113, "id": "16c63e6a-50c3-448a-9a56-a1d193cd6680", "metadata": {}, "outputs": [ @@ -2356,7 +2350,7 @@ "dtype: int64" ] }, - "execution_count": 44, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -2388,7 +2382,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 114, "id": "f5e32d8a-c42d-475f-aab4-21eca8b1aee8", "metadata": {}, "outputs": [], @@ -2398,7 +2392,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 115, "id": "d23916c1-5693-456c-b71d-6d9d78d1e2e4", "metadata": {}, "outputs": [], @@ -2413,18 +2407,18 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 116, "id": 
"7af5b342-ab18-4766-9561-e38e50cd1e9b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "<11541x3553 sparse matrix of type ''\n", - "\twith 88287 stored elements in Compressed Sparse Row format>" + "" ] }, - "execution_count": 47, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } @@ -2437,7 +2431,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 117, "id": "55e509c8-5402-4be0-9143-0e448fff7066", "metadata": {}, "outputs": [ @@ -2608,7 +2602,7 @@ " \n", " \n", "\n", - "

5 rows × 3553 columns

\n", + "

5 rows × 3571 columns

\n", "" ], "text/plain": [ @@ -2626,10 +2620,10 @@ "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", - "[5 rows x 3553 columns]" + "[5 rows x 3571 columns]" ] }, - "execution_count": 48, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } @@ -2668,7 +2662,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 118, "id": "995b511a-d448-4cfb-a6a0-22a465efd8a8", "metadata": {}, "outputs": [ @@ -2686,10 +2680,10 @@ "zone 3177\n", "zoom 3920\n", "zurich 10622\n", - "Length: 3553, dtype: int64" + "Length: 3571, dtype: int64" ] }, - "execution_count": 49, + "execution_count": 118, "metadata": {}, "output_type": "execute_result" } @@ -2709,17 +2703,17 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 119, "id": "09b222fb-ad8c-4767-a974-dd261370a06e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "918" + "np.int64(918)" ] }, - "execution_count": 50, + "execution_count": 119, "metadata": {}, "output_type": "execute_result" } @@ -2738,17 +2732,17 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 120, "id": "079ee0e0-476f-4236-ba8a-615ba7a0efe8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "\"USER is the worst. worst reservation policies. worst costumer service. worst worst worst. congrats, USER you're not that bad!\"" + "\" USER is the worst. worst reservation policies. worst costumer service. worst worst worst. 
congrats, USER you're not that bad!\"" ] }, - "execution_count": 51, + "execution_count": 120, "metadata": {}, "output_type": "execute_result" } @@ -2767,17 +2761,17 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 121, "id": "f809df1a-1178-4272-a415-42edb20173b2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5945" + "np.int64(5945)" ] }, - "execution_count": 52, + "execution_count": 121, "metadata": {}, "output_type": "execute_result" } @@ -2788,17 +2782,17 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 122, "id": "8093b6a7-54ca-468a-9376-b3c0be0b6f9b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'USER cancelled flighted 😢'" + "' USER cancelled flighted 😢'" ] }, - "execution_count": 53, + "execution_count": 122, "metadata": {}, "output_type": "execute_result" } @@ -2831,34 +2825,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 123, "id": "2bfbf838-9ff6-48b8-ad5d-5e75304fe060", "metadata": {}, "outputs": [], "source": [ "# Complete the boolean masks \n", - "positive_index = tweets[...].index\n", - "negative_index = tweets[...].index" + "positive_index = tweets[tweets['airline_sentiment'] == 'positive'].index\n", + "negative_index = tweets[tweets['airline_sentiment'] == 'negative'].index" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 124, "id": "8c67ea1f-de9e-49a9-94f2-a3351446e364", "metadata": {}, "outputs": [], "source": [ "# Complete the following two lines\n", - "pos = tfidf.loc[...].mean().sort_values(...).head(...)\n", - "neg = tfidf.loc[...].mean().sort_values(...).head(...)" + "pos = tfidf.loc[positive_index].mean().sort_values(ascending=False).head(10)\n", + "neg = tfidf.loc[negative_index].mean().sort_values(ascending=False).head(10)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 125, "id": "f1e29043-8c78-4e41-81d2-b4552030b457", "metadata": {}, - "outputs": [], + "outputs": [ + { + 
"data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "pos.plot(kind='barh', \n", " xlim=(0, 0.18),\n", @@ -2868,10 +2873,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 126, "id": "e8b25940-2372-4755-818e-f75e4d23daf9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "neg.plot(kind='barh', \n", " xlim=(0, 0.18),\n", @@ -2915,7 +2931,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 127, "id": "33413d63-87eb-489f-b374-3cfeaa51cf3c", "metadata": {}, "outputs": [], @@ -2934,7 +2950,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 128, "id": "64cec8b9-14d9-4897-9c02-cc89fcf7b3c6", "metadata": {}, "outputs": [], @@ -2955,10 +2971,22 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": null, "id": "d46de0b2-af00-4a1d-b4cd-31b96ce545d1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mEl kernel se bloqueó al ejecutar código en la celda actual o en una celda anterior. \n", + "\u001b[1;31mRevise el código de las celdas para identificar una posible causa del error. \n", + "\u001b[1;31mHaga clic aquí para obtener más información. \n", + "\u001b[1;31mVea Jupyter log para obtener más detalles." 
+ ] + } + ], "source": [ "def fit_logistic_regression(X, y):\n", " '''Fits a logistic regression model to provided data.'''\n", @@ -2982,7 +3010,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "id": "773963bd-6603-4fad-884b-09ce60afab18", "metadata": {}, "outputs": [], @@ -2993,7 +3021,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "id": "e10d06c1-d884-45d4-a03d-dd5d40bf70aa", "metadata": {}, "outputs": [ @@ -3032,7 +3060,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "6dcb6ef1-13b3-437e-813c-7118911847a4", "metadata": {}, "outputs": [], @@ -3051,7 +3079,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": null, "id": "3e63814e-9c0d-4f7a-a5e0-72cca2758d71", "metadata": {}, "outputs": [ @@ -3162,7 +3190,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": null, "id": "0d596bf7-753c-40cd-ac52-4a37163650ae", "metadata": {}, "outputs": [ @@ -3281,7 +3309,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": null, "id": "17b1223b-e5c1-4992-bb7e-0a99651c3729", "metadata": {}, "outputs": [ @@ -3308,7 +3336,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "159e00c6-8a9f-484f-aea2-853fd5512083", "metadata": {}, "outputs": [ From 74bc18ddbf058f11e60411e7229057705f16a6d4 Mon Sep 17 00:00:00 2001 From: juvizueteva Date: Wed, 26 Mar 2025 04:28:41 +0000 Subject: [PATCH 3/4] traduccion remove spaces --- lessons/01_preprocessing.ipynb | 38 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/lessons/01_preprocessing.ipynb b/lessons/01_preprocessing.ipynb index de33786..88ff85f 100644 --- a/lessons/01_preprocessing.ipynb +++ b/lessons/01_preprocessing.ipynb @@ -427,15 +427,15 @@ "id": "7bf0d8c8-bd6c-47ef-b305-09ac61d07d4d", "metadata": {}, "source": [ - "### Remove Extra Whitespace Characters\n", + "### Eliminar 
Caracteres de Espaciado Extra\n", "\n", - "Sometimes we might come across texts with extraneous whitespace, such as spaces, tabs, and newline characters, which is particularly common when the text is scrapped from web pages. Before we dive into the details, let's briefly introduce Regular Expressions (regex) and the `re` package. \n", + "A veces nos encontramos con textos que contienen espacios en blanco innecesarios, como espacios, tabulaciones y caracteres de nueva línea, lo cual es particularmente común cuando el texto proviene de páginas web. Antes de profundizar en los detalles, presentemos brevemente las Expresiones Regulares (regex) y el paquete `re`. \n", "\n", - "Regular expressions are a powerful way of searching for specific string patterns in large corpora. They have an infamously steep learning curve, but they can be very efficient when we get a handle on them. Many NLP packages heavily rely on regex under the hood. Regex testers, such as [regex101](https://regex101.com), are useful tools in both understanding and creating regex expressions.\n", + "Las expresiones regulares son una forma poderosa de buscar patrones específicos de cadenas en grandes corpus de texto. Tienen una curva de aprendizaje notoriamente empinada, pero pueden ser muy eficientes cuando logramos dominarlas. Muchos paquetes de procesamiento de lenguaje natural (NLP) dependen en gran medida de las expresiones regulares. 
Los probadores de regex, como [regex101](https://regex101.com), son herramientas útiles tanto para entender como para crear expresiones regulares.\n", "\n", - "Our goal in this workshop is not to provide a deep (or even shallow) dive into regex; instead, we want to expose you to them so that you are better prepared to do deep dives in the future!\n", + "Nuestro objetivo en este taller no es ofrecer una inmersión profunda (ni siquiera superficial) en regex; en su lugar, queremos exponerlos a ellas para que estén mejor preparados para hacer inmersiones más profundas en el futuro.\n", "\n", - "The following example is a poem by William Wordsworth. Like many poems, the text may contain extra line breaks (i.e., newline characters, `\\n`) that we want to remove." + "El siguiente ejemplo es un poema de William Wordsworth. Como muchos poemas, el texto puede contener saltos de línea extra (es decir, caracteres de nueva línea, `\\n`) que queremos eliminar." ] }, { @@ -459,7 +459,7 @@ "id": "7a693dd9-9706-40b3-863f-f568020245f7", "metadata": {}, "source": [ - "As you can see, the poem is formatted as a continuous string of text with line breaks placed at the end of each line, making it difficult to read. " + "Como pueden ver, el poema está formateado como una cadena continua de texto con saltos de línea al final de cada línea, lo que lo hace difícil de leer." ] }, { @@ -488,7 +488,7 @@ "id": "47cce993-c315-4aaa-87fe-149de8607f65", "metadata": {}, "source": [ - "One handy function we can use to display the poem properly is `.splitlines()`. As the name suggests, it splits a long text sequence into a list of lines whenever there is a newline character. " + "Una función útil que podemos usar para mostrar el poema correctamente es `.splitlines()`. Como su nombre indica, divide una secuencia de texto larga en una lista de líneas cada vez que encuentra un carácter de nueva línea." 
] }, { @@ -547,7 +547,7 @@ "id": "44d3825b-0857-44e1-bf6a-d8c7a9032704", "metadata": {}, "source": [ - "Let's return to our tweet data for an example." + "Volvamos a nuestros datos de tweets para un ejemplo.\n" ] }, { @@ -578,9 +578,9 @@ "id": "aef55865-36fd-4c06-a765-530cf3b53096", "metadata": {}, "source": [ - "In this case, we don't really want to split the tweet into a list of strings. We still expect a single string of text but would like to remove the line break completely from the string.\n", + "En este caso, realmente no queremos dividir el tweet en una lista de cadenas. Aún esperamos una sola cadena de texto, pero nos gustaría eliminar el salto de línea completamente de la cadena.\n", "\n", - "The string method `.strip()` effectively does the job of stripping away spaces at both ends of the text. However, it won't work in our example as the newline character is in the middle of the string." + "El método de cadenas `.strip()` hace eficazmente el trabajo de eliminar los espacios al principio y al final del texto. Sin embargo, no funcionará en nuestro ejemplo, ya que el carácter de nueva línea está en el medio de la cadena." ] }, { @@ -610,7 +610,7 @@ "id": "b99b80b4-804f-460f-a2d5-adbd654902b3", "metadata": {}, "source": [ - "This is where regex could be really helpful." + "Aquí es donde las expresiones regulares (regex) podrían ser realmente útiles." ] }, { @@ -628,13 +628,13 @@ "id": "d5f08d20-ba81-4e48-9e2a-5728148005b3", "metadata": {}, "source": [ - "Now, with regex, we are essentially calling it to match a pattern that we have identified in the text data, and we want to do some operations to the matched part—extract it, replace it with something else, or remove it completely. 
Therefore, the way regex works could be unpacked into the following steps:\n", "\n", + "Ahora, con las expresiones regulares (regex), esencialmente las usamos para hacer coincidir un patrón que hemos identificado en los datos de texto, y queremos realizar algunas operaciones sobre la parte coincidente: extraerla, reemplazarla por algo más o eliminarla por completo. Por lo tanto, el funcionamiento de regex podría desglosarse en los siguientes pasos:\n", "\n", - "- Identify and write the pattern in regex (`r'PATTERN'`)\n", - "- Write the replacement for the pattern (`'REPLACEMENT'`)\n", - "- Call the specific regex function (e.g., `re.sub()`)\n", + "- Identificar y escribir el patrón en regex (`r'PATTERN'`)\n", + "- Escribir el reemplazo para el patrón (`'REPLACEMENT'`)\n", + "- Llamar a la función específica de regex (por ejemplo, `re.sub()`)\n", "\n", - "In our example, the pattern we are looking for is `\\s`, which is the regex short name for any whitespace character (`\\n` and `\\t` included). We also add a quantifier `+` to the end: `\\s+`. It means we'd like to capture one or more occurences of the whitespace character." + "En nuestro ejemplo, el patrón que estamos buscando es `\\s`, que es el nombre corto de regex para cualquier carácter de espacio en blanco (incluidos `\\n` y `\\t`). También agregamos un cuantificador `+` al final: `\\s+`. Esto significa que nos gustaría capturar una o más ocurrencias del carácter de espacio en blanco." ] }, { @@ -653,7 +653,7 @@ "id": "cc075c2e-1a1d-4393-a3ea-8ad7c118364b", "metadata": {}, "source": [ - "The replacement for one or more whitespace characters is exactly one single whitespace, which is the canonical word boundary in English. Any additional whitespace will be reduced to a single whitespace. " + "El reemplazo para uno o más caracteres de espacio en blanco es exactamente un solo espacio, que es el límite de palabra canónico en inglés. Cualquier espacio adicional se reducirá a un solo espacio."
] }, { @@ -672,7 +672,7 @@ "id": "bc12e3d1-728a-429b-9c83-4dcc88590bc4", "metadata": {}, "source": [ - "Lastly, let's put everything together using the function [`re.sub()`](https://docs.python.org/3.11/library/re.html#re.sub), which means we want to substitute a pattern with a replacement. The function takes in three arguments—the pattern, the replacement, and the string to which we want to apply the function." + "Finalmente, pongamos todo junto usando la función [`re.sub()`](https://docs.python.org/3.11/library/re.html#re.sub), lo que significa que queremos sustituir un patrón por un reemplazo. La función recibe tres argumentos: el patrón, el reemplazo y la cadena a la que queremos aplicar la función." ] }, { @@ -702,7 +702,7 @@ "id": "a895fbe3-a034-4124-94af-72a528913c51", "metadata": {}, "source": [ - "Ta-da! The newline character is no longer there." + "¡Ta-da! El carácter de nueva línea ya no está allí." ] }, { From eaeb81c0c58cd9a31c34657175559b1ef5d945b9 Mon Sep 17 00:00:00 2001 From: juvizueteva Date: Wed, 26 Mar 2025 04:41:43 +0000 Subject: [PATCH 4/4] traduccion hasta antes de tokeniz --- lessons/01_preprocessing.ipynb | 44 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/lessons/01_preprocessing.ipynb b/lessons/01_preprocessing.ipynb index 88ff85f..a94dfdf 100644 --- a/lessons/01_preprocessing.ipynb +++ b/lessons/01_preprocessing.ipynb @@ -710,11 +710,11 @@ "id": "7087dc0c-5fef-4f1c-8662-7cbc8a978f34", "metadata": {}, "source": [ - "### Remove Punctuation Marks\n", + "### Eliminar los Signos de Puntuación\n", "\n", - "Sometimes we are only interested in analyzing **alphanumeric characters** (i.e., the letters and numbers), in which case we might want to remove punctuation marks. 
\n", + "A veces solo estamos interesados en analizar **caracteres alfanuméricos** (es decir, las letras y los números), en cuyo caso podríamos querer eliminar los signos de puntuación.\n", "\n", - "The `string` module contains a list of predefined punctuation marks. Let's print them out." + "El módulo `string` contiene una lista de signos de puntuación predefinidos. Vamos a imprimirlos." ] }, { @@ -742,7 +742,7 @@ "id": "91119c9e-431c-42cb-afea-f7e607698929", "metadata": {}, "source": [ - "In practice, to remove these punctuation characters, we can simply iterate over the text and remove characters found in the list, such as shown below in the `remove_punct` function." + "En la práctica, para eliminar estos caracteres de puntuación, podemos simplemente iterar sobre el texto y eliminar los caracteres que se encuentren en la lista, como se muestra a continuación en la función `remove_punct`.\n" ] }, { @@ -772,7 +772,7 @@ "id": "d4fc768b-c2dd-4386-8212-483c4485e4be", "metadata": {}, "source": [ - "Let's apply the function to the example below. " + "Aplicamos la función al siguiente ejemplo.\n" ] }, { @@ -815,7 +815,7 @@ "id": "853a4b83-f503-4405-aedd-66bbc088e3e7", "metadata": {}, "source": [ - "Let's give it a try with another tweet. What have you noticed?" + "Intentémoslo con otro tweet. ¿Qué has notado?\n" ] }, { @@ -857,7 +857,7 @@ "id": "1af02ce5-b674-4cb4-8e08-7d7416963f9c", "metadata": {}, "source": [ - "What about the following example?" + "¿Qué pasa con el siguiente ejemplo?\n" ] }, { @@ -890,7 +890,7 @@ "id": "62574c66-db3f-4500-9c3b-cea2f3eb2a30", "metadata": {}, "source": [ - "⚠️ **Warning:** In many cases, we want to remove punctuation marks **after** tokenization, which we will discuss in a minute. This tells us that the **order** of preprocessing is a matter of importance!" + "⚠️ **Advertencia:** En muchos casos, queremos eliminar los signos de puntuación **después** de la tokenización, lo cual discutiremos en un momento. 
¡Esto nos dice que el **orden** del preprocesamiento es importante!\n" ] }, { @@ -898,16 +898,16 @@ "id": "58c6b85e-58e7-4f56-9b4a-b60c85b394ba", "metadata": {}, "source": [ - "## 🥊 Challenge 1: Preprocessing with Multiple Steps\n", + "## 🥊 Desafío 1: Preprocesamiento con Múltiples Pasos\n", "\n", - "So far we've learned a few preprocessing operations, let's put them together in a function! This function would be a handy one to refer to if you happen to work with some messy English text data, and you want to preprocess it with a single function. \n", + "Hasta ahora hemos aprendido algunas operaciones de preprocesamiento, ¡vamos a combinarlas en una función! Esta función sería una referencia útil si alguna vez trabajas con datos de texto en inglés desordenados y deseas preprocesarlos con una sola función.\n", "\n", - "The example text data for challenge 1 is shown below. Write a function to:\n", - "- Lowercase the text\n", - "- Remove punctuation marks\n", - "- Remove extra whitespace characters\n", + "Los datos de texto de ejemplo para el desafío 1 se muestran a continuación. Escribe una función que:\n", + "- Ponga el texto en minúsculas\n", + "- Elimine los signos de puntuación\n", + "- Elimine los caracteres de espacio en blanco extra\n", "\n", - "Feel free to recycle the codes we've used above!" + "¡Siéntete libre de reutilizar el código que hemos usado anteriormente!\n" ] }, { @@ -986,11 +986,11 @@ "id": "67c159cb-8eaa-4c30-b8ff-38a712d2bb0f", "metadata": {}, "source": [ - "## Task-specific Processes\n", + "## Procesos Específicos para Tareas\n", "\n", - "Now that we understand common preprocessing operations, there are still a few additional operations to consider. Our text data might require further normalization depending on the language, source, and content of the data.\n", "\n", - "For example, if we are working with financial documents, we might want to standardize monetary symbols by converting them to digits. It our tweets data, there are numerous hashtags and URLs. These can be replaced with placeholders to simplify the subsequent analysis." + "Ahora que entendemos las operaciones comunes de preprocesamiento, aún hay algunas operaciones adicionales a considerar. 
Nuestros datos de texto pueden requerir una mayor normalización dependiendo del idioma, la fuente y el contenido de los datos.\n", "\n", - "For example, if we are working with financial documents, we might want to standardize monetary symbols by converting them to digits. It our tweets data, there are numerous hashtags and URLs. These can be replaced with placeholders to simplify the subsequent analysis." + "Por ejemplo, si estamos trabajando con documentos financieros, podríamos querer estandarizar los símbolos monetarios convirtiéndolos en dígitos. En nuestros datos de tweets, hay numerosos hashtags y URLs. Estos pueden ser reemplazados por marcadores de posición para simplificar el análisis posterior.\n" ] }, { @@ -998,13 +998,13 @@ "id": "c2936cea-74e9-40c2-bfbe-6ba8129330de", "metadata": {}, "source": [ - "### 🎬 **Demo**: Remove Hashtags and URLs \n", + "### 🎬 **Demostración**: Eliminar Hashtags y URLs\n", "\n", - "Although URLs, hashtags, and numbers are informative in their own right, oftentimes we don't necessarily care about the exact meaning of each of them. \n", + "Aunque las URLs, los hashtags y los números son informativos por derecho propio, a menudo no nos importa necesariamente el significado exacto de cada uno de ellos.\n", "\n", - "While we could remove them completely, it's often informative to know that there **exists** a URL or a hashtag. In practice, we replace individual URLs and hashtags with a \"symbol\" that preserves the fact these structures exist in the text. It's standard to just use the strings \"URL\" and \"HASHTAG.\"\n", + "Si bien podríamos eliminarlos por completo, a menudo es informativo saber que **existe** una URL o un hashtag. En la práctica, reemplazamos las URLs y los hashtags individuales por un \"símbolo\" que conserva el hecho de que estas estructuras existen en el texto. 
Es común usar simplemente las cadenas \"URL\" y \"HASHTAG\".\n", "\n", - "Since these types of text often follow a regular structure, they're an apt case for using regular expressions. Let's apply these patterns to the tweets data." + "Dado que estos tipos de texto suelen seguir una estructura regular, son un buen caso para usar expresiones regulares. Apliquemos estos patrones a los datos de tweets.\n" ] }, {