Skip to content

Commit 7893c8d

Browse files
committed
notebooks
1 parent e94c4d1 commit 7893c8d

File tree

7 files changed

+220022
-0
lines changed

7 files changed

+220022
-0
lines changed

1-data-scraping.ipynb

Lines changed: 560 additions & 0 deletions
Large diffs are not rendered by default.

2-pre-process.ipynb

Lines changed: 382 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,382 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# 2. Data pre-processing"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import pandas as pd\n",
17+
"import numpy as np\n",
18+
"import re\n",
19+
"import time\n",
20+
"import json\n",
21+
"\n",
22+
"import requests as rq\n",
23+
"import bs4 as bs4\n",
24+
"import tqdm \n",
25+
"import glob\n",
26+
"import feather"
27+
]
28+
},
29+
{
30+
"cell_type": "code",
31+
"execution_count": 3,
32+
"metadata": {},
33+
"outputs": [],
34+
"source": [
35+
"df = pd.read_json(\"parsed_videos.json\", lines=True)"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 4,
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"list_of_links = df['link'].unique()"
45+
]
46+
},
47+
{
48+
"cell_type": "code",
49+
"execution_count": 5,
50+
"metadata": {},
51+
"outputs": [
52+
{
53+
"name": "stderr",
54+
"output_type": "stream",
55+
"text": [
56+
"/home/lucifer/DataScience/boss/lib/python3.6/site-packages/ipykernel_launcher.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
57+
"Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
58+
" This is separate from the ipykernel package so we can avoid doing imports until\n"
59+
]
60+
},
61+
{
62+
"data": {
63+
"application/vnd.jupyter.widget-view+json": {
64+
"model_id": "e5a3b01a63464722ad786808e0601406",
65+
"version_major": 2,
66+
"version_minor": 0
67+
},
68+
"text/plain": [
69+
"HBox(children=(FloatProgress(value=0.0, max=1635.0), HTML(value='')))"
70+
]
71+
},
72+
"metadata": {},
73+
"output_type": "display_data"
74+
},
75+
{
76+
"name": "stdout",
77+
"output_type": "stream",
78+
"text": [
79+
"\n"
80+
]
81+
}
82+
],
83+
"source": [
84+
"url = 'https://www.youtube.com{link}'\n",
85+
"\n",
86+
"for link in tqdm.tqdm_notebook(list_of_links):\n",
87+
" urll = url.format(link=link)\n",
88+
" response = rq.get(urll)\n",
89+
" \n",
90+
" link_name = re.search(\"v=(.*)\", link).group(1)\n",
91+
" \n",
92+
" with open(\"./raw_data/videos/video_{}.html\".format(link_name), 'w+') as output:\n",
93+
" output.write(response.text)"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": 2,
99+
"metadata": {},
100+
"outputs": [
101+
{
102+
"name": "stderr",
103+
"output_type": "stream",
104+
"text": [
105+
"/home/lucifer/DataScience/boss/lib/python3.6/site-packages/ipykernel_launcher.py:2: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
106+
"Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
107+
" \n"
108+
]
109+
},
110+
{
111+
"data": {
112+
"application/vnd.jupyter.widget-view+json": {
113+
"model_id": "f635bd8f344b4c2abe826656d341b4a6",
114+
"version_major": 2,
115+
"version_minor": 0
116+
},
117+
"text/plain": [
118+
"HBox(children=(FloatProgress(value=0.0, max=1615.0), HTML(value='')))"
119+
]
120+
},
121+
"metadata": {},
122+
"output_type": "display_data"
123+
},
124+
{
125+
"name": "stdout",
126+
"output_type": "stream",
127+
"text": [
128+
"\n"
129+
]
130+
}
131+
],
132+
"source": [
133+
"with open(\"parsed_videos_info.json\", 'w+') as output:\n",
134+
" for video in tqdm.tqdm_notebook(sorted(glob.glob(\"./raw_data/videos/*\"))):\n",
135+
" with open(video, 'r+') as inp:\n",
136+
" page_html = inp.read()\n",
137+
" parsed = bs4.BeautifulSoup(page_html, 'html.parser')\n",
138+
" \n",
139+
" class_watch = parsed.find_all(attrs={\"class\":re.compile(r\"watch\")})\n",
140+
" id_watch = parsed.find_all(attrs={\"id\":re.compile(r\"watch\")})\n",
141+
" channel = parsed.find_all(\"a\", attrs={\"href\":re.compile(r\"channel\")})\n",
142+
" meta = parsed.find_all(\"meta\")\n",
143+
" \n",
144+
" data = dict()\n",
145+
" \n",
146+
" for e in class_watch:\n",
147+
" colname = \"_\".join(e['class'])\n",
148+
" if \"clearfix\" in colname:\n",
149+
" continue\n",
150+
" data[colname] = e.text.strip()\n",
151+
" \n",
152+
" for e in id_watch:\n",
153+
" colname = e['id']\n",
154+
" data[colname] = e.text.strip()\n",
155+
" \n",
156+
" for e in meta:\n",
157+
" colname = e.get('property')\n",
158+
" if colname is not None:\n",
159+
" data[colname] = e['content']\n",
160+
" \n",
161+
" for link_num, e in enumerate(channel):\n",
162+
" data[\"channel_link_{}\".format(link_num)] = e['href']\n",
163+
" \n",
164+
" output.write(\"{}\\n\".format(json.dumps(data)))"
165+
]
166+
},
167+
{
168+
"cell_type": "markdown",
169+
"metadata": {},
170+
"source": [
171+
"Verificação"
172+
]
173+
},
174+
{
175+
"cell_type": "code",
176+
"execution_count": 3,
177+
"metadata": {},
178+
"outputs": [
179+
{
180+
"data": {
181+
"text/plain": [
182+
"(1615, 174)"
183+
]
184+
},
185+
"execution_count": 3,
186+
"metadata": {},
187+
"output_type": "execute_result"
188+
}
189+
],
190+
"source": [
191+
"df = pd.read_json(\"parsed_videos_info.json\", lines=True)\n",
192+
"df.shape"
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"execution_count": 6,
198+
"metadata": {},
199+
"outputs": [],
200+
"source": [
201+
"useful_col = ['watch-title', 'og:url', 'watch-view-count', 'watch-time-text', 'content_watch-info-tag-list', 'watch7-headline', 'watch7-user-header', 'watch8-sentiment-actions', 'og:image', 'og:image:width', 'og:image:height', 'og:description', 'og:video:width', 'og:video:height', 'og:video:tag', 'channel_link_0']"
202+
]
203+
},
204+
{
205+
"cell_type": "code",
206+
"execution_count": 7,
207+
"metadata": {},
208+
"outputs": [
209+
{
210+
"data": {
211+
"text/html": [
212+
"<div>\n",
213+
"<style scoped>\n",
214+
" .dataframe tbody tr th:only-of-type {\n",
215+
" vertical-align: middle;\n",
216+
" }\n",
217+
"\n",
218+
" .dataframe tbody tr th {\n",
219+
" vertical-align: top;\n",
220+
" }\n",
221+
"\n",
222+
" .dataframe thead th {\n",
223+
" text-align: right;\n",
224+
" }\n",
225+
"</style>\n",
226+
"<table border=\"1\" class=\"dataframe\">\n",
227+
" <thead>\n",
228+
" <tr style=\"text-align: right;\">\n",
229+
" <th></th>\n",
230+
" <th>watch-title</th>\n",
231+
" <th>og:url</th>\n",
232+
" <th>watch-view-count</th>\n",
233+
" <th>watch-time-text</th>\n",
234+
" <th>content_watch-info-tag-list</th>\n",
235+
" <th>watch7-headline</th>\n",
236+
" <th>watch7-user-header</th>\n",
237+
" <th>watch8-sentiment-actions</th>\n",
238+
" <th>og:image</th>\n",
239+
" <th>og:image:width</th>\n",
240+
" <th>og:image:height</th>\n",
241+
" <th>og:description</th>\n",
242+
" <th>og:video:width</th>\n",
243+
" <th>og:video:height</th>\n",
244+
" <th>og:video:tag</th>\n",
245+
" <th>channel_link_0</th>\n",
246+
" </tr>\n",
247+
" </thead>\n",
248+
" <tbody>\n",
249+
" <tr>\n",
250+
" <th>0</th>\n",
251+
" <td>#DataScience #Pandas #python Python Pandas Tut...</td>\n",
252+
" <td>https://www.youtube.com/watch?v=--EdOZqByHo</td>\n",
253+
" <td>62 visualizações</td>\n",
254+
" <td>Publicado em 11 de abr. de 2020</td>\n",
255+
" <td>Educação</td>\n",
256+
" <td>#DataScience #Pandas #python Python Pandas Tut...</td>\n",
257+
" <td>Code Mania\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCarregan...</td>\n",
258+
" <td>62 visualizações\\n\\n\\n\\n\\n\\n\\n\\n4\\n\\nGostou de...</td>\n",
259+
" <td>https://i.ytimg.com/vi/--EdOZqByHo/hqdefault.jpg</td>\n",
260+
" <td>480</td>\n",
261+
" <td>360</td>\n",
262+
" <td>This pandas tutorial covers basics on datafram...</td>\n",
263+
" <td>640.0</td>\n",
264+
" <td>360.0</td>\n",
265+
" <td>python data science tutorial</td>\n",
266+
" <td>/channel/UCiO8B22LQBecxz9JjYrk7yA</td>\n",
267+
" </tr>\n",
268+
" <tr>\n",
269+
" <th>1</th>\n",
270+
" <td>Machine Learning Course A To Z || Beginner to ...</td>\n",
271+
" <td>https://www.youtube.com/watch?v=-58kO_zYUGE</td>\n",
272+
" <td>174.642 visualizações</td>\n",
273+
" <td>Publicado em 10 de ago. de 2018</td>\n",
274+
" <td>Educação</td>\n",
275+
" <td>Machine Learning Course A To Z || Beginner to ...</td>\n",
276+
" <td>Geek's Lesson\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCarre...</td>\n",
277+
" <td>174.642 visualizações\\n\\n\\n\\n\\n\\n\\n\\n5.121\\n\\n...</td>\n",
278+
" <td>https://i.ytimg.com/vi/-58kO_zYUGE/maxresdefau...</td>\n",
279+
" <td>1280</td>\n",
280+
" <td>720</td>\n",
281+
" <td>Welcome to this free online class on machine l...</td>\n",
282+
" <td>640.0</td>\n",
283+
" <td>360.0</td>\n",
284+
" <td>Ai and machine learning course</td>\n",
285+
" <td>/channel/UCKXx22vOENUyHrVAADq7Z_g</td>\n",
286+
" </tr>\n",
287+
" </tbody>\n",
288+
"</table>\n",
289+
"</div>"
290+
],
291+
"text/plain": [
292+
" watch-title \\\n",
293+
"0 #DataScience #Pandas #python Python Pandas Tut... \n",
294+
"1 Machine Learning Course A To Z || Beginner to ... \n",
295+
"\n",
296+
" og:url watch-view-count \\\n",
297+
"0 https://www.youtube.com/watch?v=--EdOZqByHo 62 visualizações \n",
298+
"1 https://www.youtube.com/watch?v=-58kO_zYUGE 174.642 visualizações \n",
299+
"\n",
300+
" watch-time-text content_watch-info-tag-list \\\n",
301+
"0 Publicado em 11 de abr. de 2020 Educação \n",
302+
"1 Publicado em 10 de ago. de 2018 Educação \n",
303+
"\n",
304+
" watch7-headline \\\n",
305+
"0 #DataScience #Pandas #python Python Pandas Tut... \n",
306+
"1 Machine Learning Course A To Z || Beginner to ... \n",
307+
"\n",
308+
" watch7-user-header \\\n",
309+
"0 Code Mania\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCarregan... \n",
310+
"1 Geek's Lesson\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCarre... \n",
311+
"\n",
312+
" watch8-sentiment-actions \\\n",
313+
"0 62 visualizações\\n\\n\\n\\n\\n\\n\\n\\n4\\n\\nGostou de... \n",
314+
"1 174.642 visualizações\\n\\n\\n\\n\\n\\n\\n\\n5.121\\n\\n... \n",
315+
"\n",
316+
" og:image og:image:width \\\n",
317+
"0 https://i.ytimg.com/vi/--EdOZqByHo/hqdefault.jpg 480 \n",
318+
"1 https://i.ytimg.com/vi/-58kO_zYUGE/maxresdefau... 1280 \n",
319+
"\n",
320+
" og:image:height og:description \\\n",
321+
"0 360 This pandas tutorial covers basics on datafram... \n",
322+
"1 720 Welcome to this free online class on machine l... \n",
323+
"\n",
324+
" og:video:width og:video:height og:video:tag \\\n",
325+
"0 640.0 360.0 python data science tutorial \n",
326+
"1 640.0 360.0 Ai and machine learning course \n",
327+
"\n",
328+
" channel_link_0 \n",
329+
"0 /channel/UCiO8B22LQBecxz9JjYrk7yA \n",
330+
"1 /channel/UCKXx22vOENUyHrVAADq7Z_g "
331+
]
332+
},
333+
"execution_count": 7,
334+
"metadata": {},
335+
"output_type": "execute_result"
336+
}
337+
],
338+
"source": [
339+
"df[useful_col].head(2)"
340+
]
341+
},
342+
{
343+
"cell_type": "code",
344+
"execution_count": 25,
345+
"metadata": {},
346+
"outputs": [],
347+
"source": [
348+
"df[useful_col].to_feather(\"raw_data.feather\")"
349+
]
350+
},
351+
{
352+
"cell_type": "code",
353+
"execution_count": 26,
354+
"metadata": {},
355+
"outputs": [],
356+
"source": [
357+
"df[useful_col].to_csv(\"raw_data_without_labels.csv\")"
358+
]
359+
}
360+
],
361+
"metadata": {
362+
"kernelspec": {
363+
"display_name": "Python 3",
364+
"language": "python",
365+
"name": "python3"
366+
},
367+
"language_info": {
368+
"codemirror_mode": {
369+
"name": "ipython",
370+
"version": 3
371+
},
372+
"file_extension": ".py",
373+
"mimetype": "text/x-python",
374+
"name": "python",
375+
"nbconvert_exporter": "python",
376+
"pygments_lexer": "ipython3",
377+
"version": "3.6.9"
378+
}
379+
},
380+
"nbformat": 4,
381+
"nbformat_minor": 4
382+
}

0 commit comments

Comments
 (0)