Add Scraping code in Automaton Directory (#16)

GrayFlash · web-flow · commit 0b8b479271dc · 2020-10-03T14:43:43.000+02:00
* Add Scraping code in Automaton Directory

* Update requirements.txt Alphabetically

* Fixed a mistake: renamed the file with the .ipynb extension

* Update News Scrapper code

* Update Netflix code remove try block

* Delete .ipynb checkpoints

* Update NetflixScrapper to clarify variable name

* Requirement.txt

* Removed variables with single line lifetime

* Few changes and delete Netflix Scrapper

* delete Netflix Scrapper
diff --git a/Automaton/ScrapNewsfromIndiaToday.ipynb b/Automaton/ScrapNewsfromIndiaToday.ipynb
@@ -0,0 +1,88 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from bs4 import BeautifulSoup\n",
+    "import requests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_soup(url):  # GET `url` and return its HTML parsed into a BeautifulSoup tree\n",
+    "    return BeautifulSoup(requests.get(url, timeout=10).text, 'html.parser')  # timeout: never hang on a stalled server"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "indiatoday = 'https://www.indiatoday.in'  # site root, prefixed to each story link\n",
+    "url = 'https://www.indiatoday.in/top-stories'  # listing page whose stories are scraped"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "top_stories = make_soup(url).find_all('div',{'class':'catagory-listing'})\n",
+    "articles_list = []  # one [title, brief, article paragraphs, image url] entry per story\n",
+    "for story in top_stories:\n",
+    "    link, thumb, teaser = story.find('a'), story.find('img'), story.find('p')\n",
+    "    if link is None or not link.get('href'):\n",
+    "        continue  # listing block without an article link: there is no page to fetch\n",
+    "    title = link.text\n",
+    "    # find() returns None for a missing tag; keep the story and store None instead of crashing.\n",
+    "    image = thumb.get('src') if thumb is not None else None\n",
+    "    brief = teaser.text if teaser is not None else None\n",
+    "    story_soup = make_soup(indiatoday + link['href'])  # hrefs are assumed site-relative - TODO confirm\n",
+    "    article = [p.text for block in story_soup.find_all('div',{'class':'description'}) for p in block.find_all('p')]\n",
+    "\n",
+    "    articles_list.append([title, brief, article, image])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(data=articles_list, columns=['Title', 'Brief Intro', 'Paragraph', 'Image Url'])  # one row per scraped story"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,11 @@
+beautifulsoup4
 keras
 matplotlib
 mlxtend
 numpy
 pandas
+requests
 seaborn
 sklearn
-tensorflow
+tabulate
+tensorflow