leetcode-book
diff --git a/‎1-data-scraping.ipynb
Lines changed: 560 additions & 0 deletions b/‎1-data-scraping.ipynb
Lines changed: 560 additions & 0 deletions
diff --git a/‎2-pre-process.ipynb
Lines changed: 382 additions & 0 deletions b/‎2-pre-process.ipynb
Lines changed: 382 additions & 0 deletions
@@ -0,0 +1,382 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2. Data pre processing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import re\n",
+    "import time\n",
+    "import json\n",
+    "\n",
+    "import requests as rq\n",
+    "import bs4 as bs4\n",
+    "import tqdm \n",
+    "import glob\n",
+    "import feather"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_json(\"parsed_videos.json\", lines=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_of_links = df['link'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/lucifer/DataScience/boss/lib/python3.6/site-packages/ipykernel_launcher.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
+      "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
+      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e5a3b01a63464722ad786808e0601406",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, max=1635.0), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "url = 'https://www.youtube.com{link}'\n",
+    "\n",
+    "for link in tqdm.tqdm_notebook(list_of_links):\n",
+    "    urll = url.format(link=link)\n",
+    "    response = rq.get(urll)\n",
+    "    \n",
+    "    link_name = re.search(\"v=(.*)\", link).group(1)\n",
+    "    \n",
+    "    with open(\"./raw_data/videos/video_{}.html\".format(link_name), 'w+') as output:\n",
+    "        output.write(response.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/lucifer/DataScience/boss/lib/python3.6/site-packages/ipykernel_launcher.py:2: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
+      "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
+      "  \n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f635bd8f344b4c2abe826656d341b4a6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatProgress(value=0.0, max=1615.0), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(\"parsed_videos_info.json\", 'w+') as output:\n",
+    "    for video in tqdm.tqdm_notebook(sorted(glob.glob(\"./raw_data/videos/*\"))):\n",
+    "        with open(video, 'r+') as inp:\n",
+    "            page_html = inp.read()\n",
+    "            parsed = bs4.BeautifulSoup(page_html, 'html.parser')\n",
+    "            \n",
+    "            class_watch = parsed.find_all(attrs={\"class\":re.compile(r\"watch\")})\n",
+    "            id_watch = parsed.find_all(attrs={\"id\":re.compile(r\"watch\")})\n",
+    "            channel = parsed.find_all(\"a\", attrs={\"href\":re.compile(r\"channel\")})\n",
+    "            meta = parsed.find_all(\"meta\")\n",
+    "            \n",
+    "            data = dict()\n",
+    "            \n",
+    "            for e in class_watch:\n",
+    "                colname = \"_\".join(e['class'])\n",
+    "                if \"clearfix\" in colname:\n",
+    "                    continue\n",
+    "                data[colname] = e.text.strip()\n",
+    "            \n",
+    "            for e in id_watch:\n",
+    "                colname = e['id']\n",
+    "                data[colname] = e.text.strip()\n",
+    "            \n",
+    "            for e in meta:\n",
+    "                colname = e.get('property')\n",
+    "                if colname is not None:\n",
+    "                    data[colname] = e['content']\n",
+    "                    \n",
+    "            for link_num, e in enumerate(channel):\n",
+    "                data[\"channel_link_{}\".format(link_num)] = e['href']\n",
+    "                \n",
+    "            output.write(\"{}\\n\".format(json.dumps(data)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Verificação"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1615, 174)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_json(\"parsed_videos_info.json\", lines=True)\n",
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "useful_col = ['watch-title', 'og:url', 'watch-view-count', 'watch-time-text', 'content_watch-info-tag-list', 'watch7-headline', 'watch7-user-header', 'watch8-sentiment-actions', 'og:image', 'og:image:width', 'og:image:height', 'og:description', 'og:video:width', 'og:video:height', 'og:video:tag', 'channel_link_0']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>watch-title</th>\n",
+       "      <th>og:url</th>\n",
+       "      <th>watch-view-count</th>\n",
+       "      <th>watch-time-text</th>\n",
+       "      <th>content_watch-info-tag-list</th>\n",
+       "      <th>watch7-headline</th>\n",
+       "      <th>watch7-user-header</th>\n",
+       "      <th>watch8-sentiment-actions</th>\n",
+       "      <th>og:image</th>\n",
+       "      <th>og:image:width</th>\n",
+       "      <th>og:image:height</th>\n",
+       "      <th>og:description</th>\n",
+       "      <th>og:video:width</th>\n",
+       "      <th>og:video:height</th>\n",
+       "      <th>og:video:tag</th>\n",
+       "      <th>channel_link_0</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>#DataScience #Pandas #python Python Pandas Tut...</td>\n",
+       "      <td>https://www.youtube.com/watch?v=--EdOZqByHo</td>\n",
+       "      <td>62 visualizações</td>\n",
+       "      <td>Publicado em 11 de abr. de 2020</td>\n",
+       "      <td>Educação</td>\n",
+       "      <td>#DataScience #Pandas #python Python Pandas Tut...</td>\n",
+       "      <td>Code Mania\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCarregan...</td>\n",
+       "      <td>62 visualizações\\n\\n\\n\\n\\n\\n\\n\\n4\\n\\nGostou de...</td>\n",
+       "      <td>https://i.ytimg.com/vi/--EdOZqByHo/hqdefault.jpg</td>\n",
+       "      <td>480</td>\n",
+       "      <td>360</td>\n",
+       "      <td>This pandas tutorial covers basics on datafram...</td>\n",
+       "      <td>640.0</td>\n",
+       "      <td>360.0</td>\n",
+       "      <td>python data science tutorial</td>\n",
+       "      <td>/channel/UCiO8B22LQBecxz9JjYrk7yA</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Machine Learning Course A To Z || Beginner to ...</td>\n",
+       "      <td>https://www.youtube.com/watch?v=-58kO_zYUGE</td>\n",
+       "      <td>174.642 visualizações</td>\n",
+       "      <td>Publicado em 10 de ago. de 2018</td>\n",
+       "      <td>Educação</td>\n",
+       "      <td>Machine Learning Course A To Z || Beginner to ...</td>\n",
+       "      <td>Geek's Lesson\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCarre...</td>\n",
+       "      <td>174.642 visualizações\\n\\n\\n\\n\\n\\n\\n\\n5.121\\n\\n...</td>\n",
+       "      <td>https://i.ytimg.com/vi/-58kO_zYUGE/maxresdefau...</td>\n",
+       "      <td>1280</td>\n",
+       "      <td>720</td>\n",
+       "      <td>Welcome to this free online class on machine l...</td>\n",
+       "      <td>640.0</td>\n",
+       "      <td>360.0</td>\n",
+       "      <td>Ai and machine learning course</td>\n",
+       "      <td>/channel/UCKXx22vOENUyHrVAADq7Z_g</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                         watch-title  \\\n",
+       "0  #DataScience #Pandas #python Python Pandas Tut...   \n",
+       "1  Machine Learning Course A To Z || Beginner to ...   \n",
+       "\n",
+       "                                        og:url       watch-view-count  \\\n",
+       "0  https://www.youtube.com/watch?v=--EdOZqByHo       62 visualizações   \n",
+       "1  https://www.youtube.com/watch?v=-58kO_zYUGE  174.642 visualizações   \n",
+       "\n",
+       "                   watch-time-text content_watch-info-tag-list  \\\n",
+       "0  Publicado em 11 de abr. de 2020                    Educação   \n",
+       "1  Publicado em 10 de ago. de 2018                    Educação   \n",
+       "\n",
+       "                                     watch7-headline  \\\n",
+       "0  #DataScience #Pandas #python Python Pandas Tut...   \n",
+       "1  Machine Learning Course A To Z || Beginner to ...   \n",
+       "\n",
+       "                                  watch7-user-header  \\\n",
+       "0  Code Mania\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCarregan...   \n",
+       "1  Geek's Lesson\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCarre...   \n",
+       "\n",
+       "                            watch8-sentiment-actions  \\\n",
+       "0  62 visualizações\\n\\n\\n\\n\\n\\n\\n\\n4\\n\\nGostou de...   \n",
+       "1  174.642 visualizações\\n\\n\\n\\n\\n\\n\\n\\n5.121\\n\\n...   \n",
+       "\n",
+       "                                            og:image  og:image:width  \\\n",
+       "0   https://i.ytimg.com/vi/--EdOZqByHo/hqdefault.jpg             480   \n",
+       "1  https://i.ytimg.com/vi/-58kO_zYUGE/maxresdefau...            1280   \n",
+       "\n",
+       "   og:image:height                                     og:description  \\\n",
+       "0              360  This pandas tutorial covers basics on datafram...   \n",
+       "1              720  Welcome to this free online class on machine l...   \n",
+       "\n",
+       "   og:video:width  og:video:height                    og:video:tag  \\\n",
+       "0           640.0            360.0    python data science tutorial   \n",
+       "1           640.0            360.0  Ai and machine learning course   \n",
+       "\n",
+       "                      channel_link_0  \n",
+       "0  /channel/UCiO8B22LQBecxz9JjYrk7yA  \n",
+       "1  /channel/UCKXx22vOENUyHrVAADq7Z_g  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[useful_col].head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[useful_col].to_feather(\"raw_data.feather\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[useful_col].to_csv(\"raw_data_without_labels.csv\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}