From 6428bf5e06c67b1584059f76b4f5749887139613 Mon Sep 17 00:00:00 2001 From: YOGESHWARAN R Date: Tue, 29 Sep 2020 15:16:04 +0530 Subject: [PATCH 1/4] Create instagram_crawler.py It crawls the Instagram page of the user and scrapes the data. --- web_programming/instagram_crawler.py | 176 +++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 web_programming/instagram_crawler.py diff --git a/web_programming/instagram_crawler.py b/web_programming/instagram_crawler.py new file mode 100644 index 000000000000..83f1fe1f5455 --- /dev/null +++ b/web_programming/instagram_crawler.py @@ -0,0 +1,176 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import requests +from bs4 import BeautifulSoup +import json + +headers = \ + { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'} + +# Usage +""" +>>> user = Instagram("github") +>>> user.is_verified() +True +>>> user.get_biography() +Built for developers. 
+ +""" + + +class Instagram(object): + """ + Class Instagram crawl instagram user information + """ + + def __init__(self, username): + self.username = username + self.url = 'https://www.instagram.com/{}/'.format(username) + + def get_json(self): + """ + return json of user information + """ + + html = requests.get(self.url, headers=headers) + soup = BeautifulSoup(html.text, 'html.parser') + try: + info = html_1(soup) + return info + except: + info = html_2(soup) + return info + + def get_followers(self): + """ + return number of followers + """ + + info = self.get_json() + followers = info['edge_followed_by']['count'] + return followers + + def get_followings(self): + """ + return number of followings + """ + + info = self.get_json() + following = info['edge_follow']['count'] + return following + + def get_posts(self): + """ + return number of posts + """ + + info = self.get_json() + posts = info['edge_owner_to_timeline_media']['count'] + return posts + + def get_biography(self): + """ + return biography of user + """ + + info = self.get_json() + bio = info['biography'] + return bio + + def get_fullname(self): + """ + return fullname of the user + """ + + info = self.get_json() + fullname = info['full_name'] + return fullname + + def get_username(self): + """ + return the username of the user + """ + + info = self.get_json() + username = info['username'] + return username + + def get_profile_pic(self): + """ + return the link of profile picture + """ + + info = self.get_json() + pic = info['profile_pic_url_hd'] + return pic + + def get_website(self): + """ + return the users's website link + """ + + info = self.get_json() + external_url = info['external_url'] + return external_url + + def get_email(self): + """ + return the email id of user if + available + """ + + info = self.get_json() + return info['business_email'] + + def is_verified(self): + """ + check the user is verified + """ + + info = self.get_json() + return info['is_verified'] + + def 
is_private(self): + """ + check user is private + """ + + info = self.get_json() + return info['is_private'] + + +def html_1(soup): + """ + parse the html type-1 of instagram + page + """ + + scripts = soup.find_all('script') + main_scripts = scripts[4] + data = main_scripts.contents[0] + info_object = data[data.find('{"config"'):-1] + info = json.loads(info_object) + info = info['entry_data']['ProfilePage'][0]['graphql']['user'] + return info + + +def html_2(soup): + """ + if html_1 fails, html_2 in action + parse the html type-2 of instagram + page + """ + scripts = soup.find_all('script') + main_scripts = scripts[3] + data = main_scripts.contents[0] + info_object = data[data.find('{"config"'):-1] + info = json.loads(info_object) + info = info['entry_data']['ProfilePage'][0]['graphql']['user'] + return info + + +user = Instagram('github') +print(user.is_verified()) +print(user.get_biography()) + From 7e359cf51376ca5eed001dcd8980911385165676 Mon Sep 17 00:00:00 2001 From: YOGESHWARAN R Date: Tue, 29 Sep 2020 16:38:09 +0530 Subject: [PATCH 2/4] Update instagram_crawler.py --- web_programming/instagram_crawler.py | 58 ++++++++++++---------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/web_programming/instagram_crawler.py b/web_programming/instagram_crawler.py index 83f1fe1f5455..72d6a8f27727 100644 --- a/web_programming/instagram_crawler.py +++ b/web_programming/instagram_crawler.py @@ -4,9 +4,9 @@ from bs4 import BeautifulSoup import json -headers = \ - { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'} +headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36" +} # Usage """ @@ -26,7 +26,7 @@ class Instagram(object): def __init__(self, username): self.username = username - self.url = 'https://www.instagram.com/{}/'.format(username) + self.url = 
"https://www.instagram.com/{}/".format(username) def get_json(self): """ @@ -34,7 +34,7 @@ def get_json(self): """ html = requests.get(self.url, headers=headers) - soup = BeautifulSoup(html.text, 'html.parser') + soup = BeautifulSoup(html.text, "html.parser") try: info = html_1(soup) return info @@ -48,7 +48,7 @@ def get_followers(self): """ info = self.get_json() - followers = info['edge_followed_by']['count'] + followers = info["edge_followed_by"]["count"] return followers def get_followings(self): @@ -57,7 +57,7 @@ def get_followings(self): """ info = self.get_json() - following = info['edge_follow']['count'] + following = info["edge_follow"]["count"] return following def get_posts(self): @@ -66,7 +66,7 @@ def get_posts(self): """ info = self.get_json() - posts = info['edge_owner_to_timeline_media']['count'] + posts = info["edge_owner_to_timeline_media"]["count"] return posts def get_biography(self): @@ -75,7 +75,7 @@ def get_biography(self): """ info = self.get_json() - bio = info['biography'] + bio = info["biography"] return bio def get_fullname(self): @@ -84,7 +84,7 @@ def get_fullname(self): """ info = self.get_json() - fullname = info['full_name'] + fullname = info["full_name"] return fullname def get_username(self): @@ -93,7 +93,7 @@ def get_username(self): """ info = self.get_json() - username = info['username'] + username = info["username"] return username def get_profile_pic(self): @@ -102,7 +102,7 @@ def get_profile_pic(self): """ info = self.get_json() - pic = info['profile_pic_url_hd'] + pic = info["profile_pic_url_hd"] return pic def get_website(self): @@ -111,17 +111,17 @@ def get_website(self): """ info = self.get_json() - external_url = info['external_url'] + external_url = info["external_url"] return external_url def get_email(self): """ - return the email id of user if + return the email id of user if available """ info = self.get_json() - return info['business_email'] + return info["business_email"] def is_verified(self): """ @@ -129,7 +129,7 
@@ def is_verified(self): """ info = self.get_json() - return info['is_verified'] + return info["is_verified"] def is_private(self): """ @@ -137,40 +137,30 @@ def is_private(self): """ info = self.get_json() - return info['is_private'] + return info["is_private"] def html_1(soup): - """ - parse the html type-1 of instagram - page - """ - - scripts = soup.find_all('script') + scripts = soup.find_all("script") main_scripts = scripts[4] data = main_scripts.contents[0] - info_object = data[data.find('{"config"'):-1] + info_object = data[data.find('{"config"') : -1] info = json.loads(info_object) - info = info['entry_data']['ProfilePage'][0]['graphql']['user'] + info = info["entry_data"]["ProfilePage"][0]["graphql"]["user"] return info def html_2(soup): - """ - if html_1 fails, html_2 in action - parse the html type-2 of instagram - page - """ - scripts = soup.find_all('script') + scripts = soup.find_all("script") main_scripts = scripts[3] data = main_scripts.contents[0] - info_object = data[data.find('{"config"'):-1] + info_object = data[data.find('{"config"') : -1] info = json.loads(info_object) - info = info['entry_data']['ProfilePage'][0]['graphql']['user'] + info = info["entry_data"]["ProfilePage"][0]["graphql"]["user"] return info -user = Instagram('github') +user = Instagram("github") print(user.is_verified()) print(user.get_biography()) From 8bc910461a5da538d5dcde8537ff81d74d8ba7e9 Mon Sep 17 00:00:00 2001 From: YOGESHWARAN R Date: Tue, 29 Sep 2020 16:39:17 +0530 Subject: [PATCH 3/4] changed class name to InstagramUser Co-authored-by: Christian Clauss --- web_programming/instagram_crawler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/web_programming/instagram_crawler.py b/web_programming/instagram_crawler.py index 72d6a8f27727..c081c4b1e6e5 100644 --- a/web_programming/instagram_crawler.py +++ b/web_programming/instagram_crawler.py @@ -19,7 +19,7 @@ """ -class Instagram(object): +class InstagramUser: """ Class Instagram crawl instagram 
user information """ @@ -163,4 +163,3 @@ def html_2(soup): user = Instagram("github") print(user.is_verified()) print(user.get_biography()) - From bafda76c0ceea59b56ca14069617952247f42340 Mon Sep 17 00:00:00 2001 From: YOGESHWARAN R Date: Tue, 29 Sep 2020 17:13:44 +0530 Subject: [PATCH 4/4] some changes have been made --- web_programming/instagram_crawler.py | 80 +++++++++++++++------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/web_programming/instagram_crawler.py b/web_programming/instagram_crawler.py index c081c4b1e6e5..26c80efa17ef 100644 --- a/web_programming/instagram_crawler.py +++ b/web_programming/instagram_crawler.py @@ -11,9 +11,9 @@ # Usage """ >>> user = Instagram("github") ->>> user.is_verified() +>>> user.is_verified True ->>> user.get_biography() +>>> user.get_biography Built for developers. """ @@ -26,7 +26,7 @@ class InstagramUser: def __init__(self, username): self.username = username - self.url = "https://www.instagram.com/{}/".format(username) + self.url = f"https://www.instagram.com/{username}/" def get_json(self): """ @@ -36,85 +36,84 @@ def get_json(self): html = requests.get(self.url, headers=headers) soup = BeautifulSoup(html.text, "html.parser") try: - info = html_1(soup) - return info - except: - info = html_2(soup) - return info + return html_1(soup) + except json.decoder.JSONDecodeError: + return html_2(soup) - def get_followers(self): + @property + def no_of_followers(self) -> int: """ return number of followers """ info = self.get_json() - followers = info["edge_followed_by"]["count"] - return followers + return info["edge_followed_by"]["count"] - def get_followings(self): + @property + def no_of_followings(self) -> int: """ return number of followings """ info = self.get_json() - following = info["edge_follow"]["count"] - return following + return info["edge_follow"]["count"] - def get_posts(self): + @property + def no_of_posts(self) -> int: """ return number of posts """ info = self.get_json() - posts = 
info["edge_owner_to_timeline_media"]["count"] - return posts + return info["edge_owner_to_timeline_media"]["count"] - def get_biography(self): + @property + def get_biography(self) -> str: """ return biography of user """ info = self.get_json() - bio = info["biography"] - return bio + return info["biography"] - def get_fullname(self): + @property + def get_fullname(self) -> str: """ return fullname of the user """ info = self.get_json() - fullname = info["full_name"] - return fullname + return info["full_name"] - def get_username(self): + @property + def get_username(self) -> str: """ return the username of the user """ info = self.get_json() - username = info["username"] - return username + return info["username"] - def get_profile_pic(self): + @property + def get_profile_pic(self) -> str: """ return the link of profile picture """ info = self.get_json() - pic = info["profile_pic_url_hd"] - return pic + return info["profile_pic_url_hd"] - def get_website(self): + @property + def get_website(self) -> str: """ return the users's website link """ info = self.get_json() - external_url = info["external_url"] - return external_url + return info["external_url"] - def get_email(self): + @property + def get_email(self) -> str: """ return the email id of user if available @@ -123,7 +122,8 @@ def get_email(self): info = self.get_json() return info["business_email"] - def is_verified(self): + @property + def is_verified(self) -> bool: """ check the user is verified """ @@ -131,7 +131,8 @@ def is_verified(self): info = self.get_json() return info["is_verified"] - def is_private(self): + @property + def is_private(self) -> bool: """ check user is private """ @@ -160,6 +161,9 @@ def html_2(soup): return info -user = Instagram("github") -print(user.is_verified()) -print(user.get_biography()) +if __name__ == "__main__": + user = InstagramUser("github") + print(f"{user.is_verified = }") + print(f"{user.get_biography = }") + +