Skip to content

Create instagram_crawler.py #2508

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 24 additions & 34 deletions web_programming/instagram_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
from bs4 import BeautifulSoup
import json

headers = \
{
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"

}

# Usage
"""
Expand All @@ -26,15 +26,15 @@ class Instagram(object):

def __init__(self, username):
    """
    store the account name and build the profile url from it

    username: account handle, substituted verbatim into the url
    """
    self.username = username
    # a single assignment; the old/new copies of this line differed only in quote style
    self.url = "https://www.instagram.com/{}/".format(username)

def get_json(self):
"""
return json of user information
"""

html = requests.get(self.url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')
soup = BeautifulSoup(html.text, "html.parser")
try:
info = html_1(soup)
return info
Expand All @@ -48,7 +48,7 @@ def get_followers(self):
"""

info = self.get_json()
followers = info['edge_followed_by']['count']
followers = info["edge_followed_by"]["count"]
return followers

def get_followings(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def get_followings(self):
def get_number_of_followings(self) -> int:

Expand All @@ -57,7 +57,7 @@ def get_followings(self):
"""

info = self.get_json()
following = info['edge_follow']['count']
following = info["edge_follow"]["count"]
return following

def get_posts(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def get_posts(self):
def get_number_of_posts(self) -> int:

Expand All @@ -66,7 +66,7 @@ def get_posts(self):
"""

info = self.get_json()
posts = info['edge_owner_to_timeline_media']['count']
posts = info["edge_owner_to_timeline_media"]["count"]
return posts

def get_biography(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def get_biography(self):
def get_biography(self) -> str:

Expand All @@ -75,7 +75,7 @@ def get_biography(self):
"""

info = self.get_json()
bio = info['biography']
bio = info["biography"]
return bio

def get_fullname(self):
Expand All @@ -84,7 +84,7 @@ def get_fullname(self):
"""

info = self.get_json()
fullname = info['full_name']
fullname = info["full_name"]
return fullname

def get_username(self):
Expand All @@ -93,7 +93,7 @@ def get_username(self):
"""

info = self.get_json()
username = info['username']
username = info["username"]
return username

def get_profile_pic(self):
Expand All @@ -102,7 +102,7 @@ def get_profile_pic(self):
"""

info = self.get_json()
pic = info['profile_pic_url_hd']
pic = info["profile_pic_url_hd"]
return pic

def get_website(self):
Expand All @@ -111,66 +111,56 @@ def get_website(self):
"""

info = self.get_json()
external_url = info['external_url']
external_url = info["external_url"]
return external_url

def get_email(self):
    """
    return the email id of user if available

    Reads the ``business_email`` entry of the dict returned by
    ``self.get_json()``. Returns ``None`` when that entry is absent,
    instead of raising ``KeyError``.
    """
    info = self.get_json()
    # .get() keeps the "if available" promise for profiles without the key
    return info.get("business_email")

def is_verified(self) -> bool:
    """
    check whether the user is verified

    Returns the ``is_verified`` entry of the dict produced by
    ``self.get_json()``; raises ``KeyError`` if the entry is missing.
    """
    info = self.get_json()
    return info["is_verified"]

def is_private(self) -> bool:
    """
    check whether the user account is private

    Returns the ``is_private`` entry of the dict produced by
    ``self.get_json()``; raises ``KeyError`` if the entry is missing.
    """
    info = self.get_json()
    return info["is_private"]


def html_1(soup):
    """
    parse the html type-1 of instagram page

    Takes the fifth ``<script>`` element (index 4) of ``soup``, cuts its
    first content node from the first ``{"config"`` marker up to, but not
    including, the final character, and decodes that slice as JSON.

    soup: parsed page exposing ``find_all("script")``
    returns: the ``entry_data -> ProfilePage[0] -> graphql -> user`` object
    raises: IndexError / KeyError / ValueError when the page does not have
            this layout (fewer scripts, missing keys, undecodable JSON)
    """
    scripts = soup.find_all("script")
    main_script = scripts[4]
    data = main_script.contents[0]
    # drop the last character of the script text
    # (presumably a trailing ";" after the JSON object -- TODO confirm)
    info_object = data[data.find('{"config"') : -1]
    info = json.loads(info_object)
    return info["entry_data"]["ProfilePage"][0]["graphql"]["user"]


def html_2(soup):
    """
    if html_1 fails, html_2 in action
    parse the html type-2 of instagram page

    Same extraction as the type-1 parser, except the data is read from
    the fourth ``<script>`` element (index 3) of ``soup``.

    soup: parsed page exposing ``find_all("script")``
    returns: the ``entry_data -> ProfilePage[0] -> graphql -> user`` object
    raises: IndexError / KeyError / ValueError when the page does not have
            this layout (fewer scripts, missing keys, undecodable JSON)
    """
    scripts = soup.find_all("script")
    main_script = scripts[3]
    data = main_script.contents[0]
    # drop the last character of the script text
    # (presumably a trailing ";" after the JSON object -- TODO confirm)
    info_object = data[data.find('{"config"') : -1]
    info = json.loads(info_object)
    return info["entry_data"]["ProfilePage"][0]["graphql"]["user"]


# demo: look up the "github" profile and print two of its fields
# (each accessor calls get_json(), which issues its own HTTP request)
user = Instagram("github")
print(user.is_verified())
print(user.get_biography())