Skip to content

Create instagram_crawler.py #2509

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Sep 29, 2020
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/codespell.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
- run: pip install codespell flake8
- run: |
SKIP="./.*,./other/dictionary.txt,./other/words,./project_euler/problem_22/p022_names.txt"
codespell --ignore-words-list=ans,fo,hist,iff,secant,som,tim --skip=$SKIP --quiet-level=2
codespell --ignore-words-list=ans,fo,followings,hist,iff,secant,som,tim --skip=$SKIP --quiet-level=2
- name: Codespell comment
if: ${{ failure() }}
uses: plettich/python_codespell_action@master
142 changes: 142 additions & 0 deletions web_programming/instagram_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
from __future__ import annotations

import json

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent string; the dict below is passed as the request
# headers by InstagramUser.get_json().
_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
)

headers = {"User-Agent": _USER_AGENT}


def extract_user_profile(script) -> dict:
    """
    Return the user record found in the JSON embedded in a <script> tag.

    The JSON text is taken from the first occurrence of '{"config"' up to,
    but not including, the last character of the script's first child.

    May raise json.decoder.JSONDecodeError
    """
    text = script.contents[0]
    start = text.find('{"config"')
    payload = json.loads(text[start:-1])
    profile_page = payload["entry_data"]["ProfilePage"][0]
    return profile_page["graphql"]["user"]


class InstagramUser:
    """
    Crawl the profile page of an Instagram user and expose its fields.

    The profile data is fetched once, when the object is created, and kept
    in ``user_data``; every property is a plain lookup into that dict.

    Usage:
    >>> instagram_user = InstagramUser("github")
    >>> instagram_user.is_verified
    True
    >>> instagram_user.biography
    'Built for developers.'
    """

    # Seconds to wait for the server before giving up; without a timeout
    # requests.get() may block indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, username: str) -> None:
        """
        Build the profile URL for ``username`` and download its data.
        """
        self.url = f"https://www.instagram.com/{username}/"
        self.user_data = self.get_json()

    def get_json(self) -> dict:
        """
        Return a dict of user information

        The page is downloaded and the fifth <script> tag is parsed; if that
        fails with json.decoder.JSONDecodeError or KeyError, the fourth
        <script> tag is tried instead and any error from it propagates.
        """
        html = requests.get(
            self.url, headers=headers, timeout=self.REQUEST_TIMEOUT
        ).text
        scripts = BeautifulSoup(html, "html.parser").find_all("script")
        try:
            return extract_user_profile(scripts[4])
        except (json.decoder.JSONDecodeError, KeyError):
            return extract_user_profile(scripts[3])

    def __repr__(self) -> str:
        # !r quotes the username so the repr reads like a constructor call.
        return f"{self.__class__.__name__}({self.username!r})"

    def __str__(self) -> str:
        return f"Instagram user {self.fullname} ({self.username}) is {self.biography}"

    @property
    def username(self) -> str:
        return self.user_data["username"]

    @property
    def fullname(self) -> str:
        return self.user_data["full_name"]

    @property
    def biography(self) -> str:
        return self.user_data["biography"]

    @property
    def email(self) -> str:
        return self.user_data["business_email"]

    @property
    def website(self) -> str:
        return self.user_data["external_url"]

    @property
    def number_of_followers(self) -> int:
        return self.user_data["edge_followed_by"]["count"]

    @property
    def number_of_followings(self) -> int:
        return self.user_data["edge_follow"]["count"]

    @property
    def number_of_posts(self) -> int:
        return self.user_data["edge_owner_to_timeline_media"]["count"]

    @property
    def profile_picture_url(self) -> str:
        return self.user_data["profile_pic_url_hd"]

    @property
    def is_verified(self) -> bool:
        return self.user_data["is_verified"]

    @property
    def is_private(self) -> bool:
        return self.user_data["is_private"]


def test_instagram_user(username: str = "github") -> None:
    """
    A self running doctest

    Builds an InstagramUser for ``username`` and checks the crawled data.
    The detailed field checks only run for the default "github" account;
    nothing is checked when the CONTINUOUS_INTEGRATION environment variable
    is set.

    >>> test_instagram_user()
    """
    from os import getenv

    if getenv("CONTINUOUS_INTEGRATION"):
        return  # test failing on Travis CI
    instagram_user = InstagramUser(username)
    assert instagram_user.user_data
    assert isinstance(instagram_user.user_data, dict)
    assert instagram_user.username == username
    if username != "github":
        # Expected values below are only known for the "github" profile.
        return
    assert instagram_user.fullname == "GitHub"
    assert instagram_user.biography == "Built for developers."
    assert instagram_user.number_of_posts > 150
    assert instagram_user.number_of_followers > 120000
    assert instagram_user.number_of_followings > 15
    # NOTE(review): this literal was the placeholder "[email protected]" (an
    # email-obfuscation artifact that can never equal a real address); the
    # value below is the presumed original -- confirm against the profile.
    assert instagram_user.email == "support@github.com"
    assert instagram_user.website == "https://github.com/readme"
    assert instagram_user.profile_picture_url.startswith("https://instagram.")
    assert instagram_user.is_verified is True
    assert instagram_user.is_private is False


if __name__ == "__main__":
    import doctest

    # Run the module's doctests, then print a summary of the "github" profile.
    doctest.testmod()
    instagram_user = InstagramUser("github")
    print(instagram_user)
    # Each line is the attribute expression followed by " = " and the repr
    # of its value, i.e. what f"{instagram_user.<field> = }" would print.
    for _field in (
        "number_of_posts",
        "number_of_followers",
        "number_of_followings",
        "email",
        "website",
        "profile_picture_url",
        "is_verified",
        "is_private",
    ):
        print(f"instagram_user.{_field} = {getattr(instagram_user, _field)!r}")