Commit abc573c

Working sessionize module.

1 parent b774929

3 files changed: +86, -5

main.py (+6, -3)

@@ -4,12 +4,15 @@
 import devopsdays
 import papercall
 import models
+import sessionize

 def scrape_all():
     print('Scraping Papercall')
     yield from papercall.scrape()
     print('Scraping Devopsdays')
     yield from devopsdays.scrape()
+    print('Scraping Sessionize')
+    yield from sessionize.scrape()


 def sync_record(existing, fields):
@@ -20,11 +23,11 @@ def sync_record(existing, fields):
     elif isinstance(value, date):
         fields[key] = value.isoformat()
     if not fields.get('Conference Start Date'):
-        fields.pop('Conference Start Date')
+        fields.pop('Conference Start Date', None)
     if not fields.get('Conference End Date'):
-        fields.pop('Conference End Date')
+        fields.pop('Conference End Date', None)
     if not fields.get('Tags'):
-        fields.pop('Tags')
+        fields.pop('Tags', None)

     # No existing version, create it.
     if existing is None:
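
The pop() defaults are the substantive fix in this hunk: without a default, dict.pop raises KeyError when the key is absent, and these guards use fields.get(), which treats a missing key and a falsy value the same way. A minimal illustration (the fields dict below is hypothetical):

    fields = {'Conference Name': 'DevConf', 'Tags': ''}  # hypothetical record

    # 'Tags' is present but falsy: either form of pop removes it cleanly.
    if not fields.get('Tags'):
        fields.pop('Tags', None)

    # 'Tags' is now missing entirely: fields.get('Tags') is still falsy, so
    # the old fields.pop('Tags') would raise KeyError here; the defaulted
    # form is a safe no-op.
    if not fields.get('Tags'):
        fields.pop('Tags', None)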

sessionize.py (+32, -2)

@@ -2,6 +2,7 @@
 import requests
 from bs4 import BeautifulSoup

+import twitter_utils

 def get(url):
     res = requests.get(url)
@@ -17,6 +18,15 @@ def find_navy_section(root, label):
 def parse_event(url):
     root = get(url)

+    if root.find('span', string='Speaker Profile'):
+        return None
+
+    if 'Log in' in root.find('title').string:
+        return None
+
+    if '@ Sessionize.com' not in root.find('title').string:
+        return None
+
     data = {
         'Conference Name': root.select('.ibox-title h4')[0].string,
         'CFP URL': url,
@@ -66,6 +76,26 @@ def parse_event(url):

     return data

+
+def find_events():
+    seen_urls = set()
+    for url in twitter_utils.search_for_url('sessionize.com'):
+        # Strip the query params and downcase it.
+        clean_url = url.split('?')[0].lower().rstrip('/')
+        if clean_url in seen_urls:
+            continue
+        if '/api/' in clean_url:
+            continue
+        evt = parse_event(clean_url)
+        if evt is not None:
+            yield evt
+        seen_urls.add(clean_url)
+
+
+def scrape():
+    yield from find_events()
+
+
 if __name__ == '__main__':
-    print(parse_event('https://sessionize.com/mixit19'))
-    print(parse_event('https://sessionize.com/blockchain-saturday-Utah-v2'))
+    for d in find_events():
+        print(d)
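
For context, the three early returns added to parse_event filter out sessionize.com pages that aren't CFP event pages: speaker profiles, login walls, and anything whose <title> lacks the site's suffix. A standalone sketch of the same checks, run against hypothetical HTML strings:

    from bs4 import BeautifulSoup

    event_html = '<html><head><title>DevConf 2019 @ Sessionize.com</title></head></html>'
    login_html = '<html><head><title>Log in @ Sessionize.com</title></head></html>'

    def looks_like_event(html):
        root = BeautifulSoup(html, 'html.parser')
        title = root.find('title').string
        # Same order as the commit: profiles and login walls bail out first.
        if root.find('span', string='Speaker Profile'):
            return False
        if 'Log in' in title:
            return False
        return '@ Sessionize.com' in title

    print(looks_like_event(event_html))  # True
    print(looks_like_event(login_html))  # False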

twitter_utils.py (+48)

@@ -0,0 +1,48 @@
+import os
+
+import requests
+import tweepy
+
+auth = tweepy.OAuthHandler(os.environ['TWITTER_CONSUMER_KEY'], os.environ['TWITTER_CONSUMER_SECRET'])
+auth.set_access_token(os.environ['TWITTER_ACCESS_KEY'], os.environ['TWITTER_ACCESS_SECRET'])
+
+api = tweepy.API(auth)
+
+_expand_cache = {}
+
+def expand_url(url):
+    expanded = _expand_cache.get(url)
+    if expanded is not None:
+        return expanded
+    expanded = requests.head(url, allow_redirects=True).url
+    _expand_cache[url] = expanded
+    return expanded
+
+
+def search_for_url(query, total=1000):
+    max_id = None
+    last_max_id = None
+    count = 0
+    while count < total:
+        for tweet in api.search(q=query, count=100, max_id=max_id, result_type='recent'):
+            count += 1
+            if max_id:
+                max_id = min(max_id, tweet.id)
+            else:
+                max_id = tweet.id
+            for url in tweet.entities['urls']:
+                # Twitter only expands their own shortener, so expand even more.
+                truly_expanded_url = url['expanded_url']
+                if query not in truly_expanded_url:
+                    truly_expanded_url = expand_url(truly_expanded_url)
+                if query in truly_expanded_url:
+                    yield truly_expanded_url
+        # Did we run out of tweets?
+        if last_max_id == max_id:
+            break
+        last_max_id = max_id
+
+
+if __name__ == '__main__':
+    for url in search_for_url('sessionize.com'):
+        print(url)
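
The search loop relies on Twitter's max_id pagination: max_id is inclusive, so each pass re-requests from the oldest tweet seen so far, and the last_max_id comparison detects when a pass turns up nothing older. A minimal sketch of the same termination pattern, where page() is a stand-in data source rather than a tweepy call:

    def page(max_id=None, count=3):
        # Stand-in for api.search: returns ids <= max_id, newest first.
        ids = [90, 80, 70, 60, 50]
        eligible = [i for i in ids if max_id is None or i <= max_id]
        return eligible[:count]

    max_id = None
    last_max_id = None
    while True:
        for item_id in page(max_id):
            max_id = item_id if max_id is None else min(max_id, item_id)
        if last_max_id == max_id:
            break  # nothing older came back; we ran out of items
        last_max_id = max_id

    print(max_id)  # 50: walked back to the oldest item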
