Skip to content

Commit 0b8b479

Browse files
authored
Add Scraping code in Automation Directory (#16)
* Add Scraping code in Automation Directory * Update requirements.txt Alphabetically * Fixed a mistake: renamed with .ipynb extension * Update News Scraper code * Update Netflix code: remove try block * Delete .ipynb checkpoints * Update NetflixScrapper to clarify variable name * requirements.txt * Removed variables with single line lifetime * Few changes and delete Netflix Scraper * delete Netflix Scraper
1 parent 054be02 commit 0b8b479

File tree

2 files changed

+92
-1
lines changed

2 files changed

+92
-1
lines changed
+88
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import numpy as np\n",
10+
"import pandas as pd\n",
11+
"from bs4 import BeautifulSoup\n",
12+
"import requests"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": 2,
18+
"metadata": {},
19+
"outputs": [],
20+
"source": [
21+
"def make_soup(url):\n",
22+
" return BeautifulSoup(requests.get(url).text, 'html.parser')"
23+
]
24+
},
25+
{
26+
"cell_type": "code",
27+
"execution_count": 3,
28+
"metadata": {},
29+
"outputs": [],
30+
"source": [
31+
"url = 'https://www.indiatoday.in/top-stories'\n",
32+
"indiatoday = 'https://www.indiatoday.in'"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": 4,
38+
"metadata": {},
39+
"outputs": [],
40+
"source": [
41+
"top_stories = make_soup(url).find_all('div',{'class':'catagory-listing'})\n",
42+
"articles_list = []\n",
43+
"for story in top_stories:\n",
44+
" image = story.find('img')['src']\n",
45+
" title = story.find('a').text\n",
46+
" story_soup = make_soup(indiatoday + story.find('a')['href'])\n",
47+
" brief = story.find('p').text\n",
48+
" \n",
49+
" article = []\n",
50+
" for description in story_soup.find_all('div',{'class':'description'}): \n",
51+
" for paragraph in description.find_all('p'):\n",
52+
" article.append(paragraph.text)\n",
53+
"\n",
54+
" articles_list.append([title, brief, article, image])"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 5,
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"df = pd.DataFrame(articles_list, columns=['Title', 'Brief Intro', 'Paragraph', 'Image Url'])"
64+
]
65+
}
66+
],
67+
"metadata": {
68+
"kernelspec": {
69+
"display_name": "Python 3",
70+
"language": "python",
71+
"name": "python3"
72+
},
73+
"language_info": {
74+
"codemirror_mode": {
75+
"name": "ipython",
76+
"version": 3
77+
},
78+
"file_extension": ".py",
79+
"mimetype": "text/x-python",
80+
"name": "python",
81+
"nbconvert_exporter": "python",
82+
"pygments_lexer": "ipython3",
83+
"version": "3.7.6"
84+
}
85+
},
86+
"nbformat": 4,
87+
"nbformat_minor": 4
88+
}

requirements.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
beautifulsoup4
12
keras
23
matplotlib
34
mlxtend
45
numpy
56
pandas
7+
requests
68
seaborn
79
sklearn
8-
tensorflow
10+
tabulate
11+
tensorflow

0 commit comments

Comments
 (0)