Skip to content

Commit 3a49cbf

Browse files
yogeshwaran01, cclauss, and dhruvmanila
authored and committed
Create instagram_crawler.py (TheAlgorithms#2509)
* Create instagram_crawler.py * codespell --ignore-words-list=followings * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <[email protected]> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <[email protected]> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <[email protected]> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <[email protected]> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <[email protected]> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <[email protected]> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <[email protected]> * Update instagram_crawler.py * Add doctests * fixup! except (json.decoder.JSONDecodeError, KeyError): * if getenv("CONTINUOUS_INTEGRATION"): return * Update instagram_crawler.py * Update web_programming/instagram_crawler.py Co-authored-by: Dhruv <[email protected]> * added fake_useragent * Update instagram_crawler.py * Comment out doctests Co-authored-by: Christian Clauss <[email protected]> Co-authored-by: Dhruv <[email protected]>
1 parent c576c97 commit 3a49cbf

File tree

2 files changed

+141
-1
lines changed

2 files changed

+141
-1
lines changed

Diff for: .github/workflows/codespell.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
- run: pip install codespell flake8
1212
- run: |
1313
SKIP="./.*,./other/dictionary.txt,./other/words,./project_euler/problem_22/p022_names.txt"
14-
codespell --ignore-words-list=ans,fo,hist,iff,secant,som,tim --skip=$SKIP --quiet-level=2
14+
codespell --ignore-words-list=ans,fo,followings,hist,iff,secant,som,tim --skip=$SKIP --quiet-level=2
1515
- name: Codespell comment
1616
if: ${{ failure() }}
1717
uses: plettich/python_codespell_action@master

Diff for: web_programming/instagram_crawler.py

+140
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
#!/usr/bin/env python3
2+
from __future__ import annotations
3+
4+
import json
5+
6+
import requests
7+
from bs4 import BeautifulSoup
8+
from fake_useragent import UserAgent
9+
10+
# HTTP request headers passed to requests.get() when a profile is fetched.
# The standard header name is "User-Agent"; the un-hyphenated "UserAgent" key
# was sent as an unrelated custom header, so the randomised browser string
# never replaced the HTTP client's default user agent.
headers = {"User-Agent": UserAgent().random}
11+
12+
13+
def extract_user_profile(script) -> dict:
    """
    Return the user dict embedded in the text of a <script> element.

    ``script`` must expose a ``contents`` sequence whose first item is the
    script text.  The JSON object that starts at ``{"config"`` is decoded and
    ``["entry_data"]["ProfilePage"][0]["graphql"]["user"]`` is returned.

    Whatever follows the JSON object (";", whitespace or nothing at all) is
    ignored, so the result does not depend on the text ending in exactly one
    trailing character.

    May raise json.decoder.JSONDecodeError (marker missing or invalid JSON)
    and KeyError (the JSON lacks the expected profile structure).
    """
    data = script.contents[0]
    start = data.find('{"config"')
    if start == -1:
        # str.find() signals "not found" with -1; fail with the exception type
        # callers already handle instead of slicing from a bogus offset.
        raise json.JSONDecodeError('Missing {"config" marker', data, 0)
    # raw_decode() parses exactly one JSON value beginning at ``start`` and
    # stops at its end, leaving any trailing text unparsed.
    info, _ = json.JSONDecoder().raw_decode(data, start)
    return info["entry_data"]["ProfilePage"][0]["graphql"]["user"]
20+
21+
22+
class InstagramUser:
    """
    Crawl the public profile page of an Instagram user.

    The profile page is downloaded once, when the object is created, and the
    parsed result is kept in ``user_data``; every property reads from that
    dict.

    Usage: (doctest failing on Travis CI)
    # >>> instagram_user = InstagramUser("github")
    # >>> instagram_user.is_verified
    True
    # >>> instagram_user.biography
    'Built for developers.'
    """

    def __init__(self, username):
        self.url = f"https://www.instagram.com/{username}/"
        self.user_data = self.get_json()

    def get_json(self) -> dict:
        """
        Return a dict of user information

        Downloads ``self.url``, collects its <script> elements and hands the
        fifth one (falling back to the fourth) to extract_user_profile().
        Errors raised while parsing the fallback script propagate to the
        caller.
        """
        # Always pass a timeout: without one, requests waits indefinitely on
        # a server that never answers.
        html = requests.get(self.url, headers=headers, timeout=10).text
        scripts = BeautifulSoup(html, "html.parser").find_all("script")
        try:
            return extract_user_profile(scripts[4])
        except (json.decoder.JSONDecodeError, KeyError, IndexError):
            # IndexError covers pages with fewer than five <script> tags, so
            # the fallback is still attempted instead of failing right away.
            return extract_user_profile(scripts[3])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}('{self.username}')"

    def __str__(self) -> str:
        return f"{self.fullname} ({self.username}) is {self.biography}"

    # Each property below is a direct lookup into ``user_data``.

    @property
    def username(self) -> str:
        return self.user_data["username"]

    @property
    def fullname(self) -> str:
        return self.user_data["full_name"]

    @property
    def biography(self) -> str:
        return self.user_data["biography"]

    @property
    def email(self) -> str:
        return self.user_data["business_email"]

    @property
    def website(self) -> str:
        return self.user_data["external_url"]

    @property
    def number_of_followers(self) -> int:
        return self.user_data["edge_followed_by"]["count"]

    @property
    def number_of_followings(self) -> int:
        return self.user_data["edge_follow"]["count"]

    @property
    def number_of_posts(self) -> int:
        return self.user_data["edge_owner_to_timeline_media"]["count"]

    @property
    def profile_picture_url(self) -> str:
        return self.user_data["profile_pic_url_hd"]

    @property
    def is_verified(self) -> bool:
        return self.user_data["is_verified"]

    @property
    def is_private(self) -> bool:
        return self.user_data["is_private"]
98+
99+
100+
def test_instagram_user(username: str = "github") -> None:
    """
    Fetch ``username`` through InstagramUser and sanity-check the result.

    Returns immediately when the CONTINUOUS_INTEGRATION environment variable
    is set.  The detailed expectations only apply to the "github" account.

    >>> test_instagram_user()
    """
    import os

    if os.getenv("CONTINUOUS_INTEGRATION"):
        return  # test failing on Travis CI
    user = InstagramUser(username)
    assert user.user_data
    assert isinstance(user.user_data, dict)
    assert user.username == username
    if username == "github":
        assert user.fullname == "GitHub"
        assert user.biography == "Built for developers."
        assert user.number_of_posts > 150
        assert user.number_of_followers > 120000
        assert user.number_of_followings > 15
        # NOTE(review): this literal looks like an e-mail obfuscation artifact
        # rather than a real address -- confirm the intended value.
        assert user.email == "[email protected]"
        assert user.website == "https://github.com/readme"
        assert user.profile_picture_url.startswith("https://instagram.")
        assert user.is_verified is True
        assert user.is_private is False
125+
126+
127+
if __name__ == "__main__":
    import doctest

    # Run the module's doctests, then print a summary of the "github" profile.
    doctest.testmod()
    instagram_user = InstagramUser("github")
    print(instagram_user)
    # Each line has the "expression = repr(value)" form that the f-string
    # "=" specifier produces.
    for attribute in (
        "number_of_posts",
        "number_of_followers",
        "number_of_followings",
        "email",
        "website",
        "profile_picture_url",
        "is_verified",
        "is_private",
    ):
        value = getattr(instagram_user, attribute)
        print(f"instagram_user.{attribute} = {value!r}")

0 commit comments

Comments
 (0)