3 files changed: +86 -5 lines changed
File 1 of 3

 import devopsdays
 import papercall
 import models
+import sessionize

 def scrape_all():
     print('Scraping Papercall')
     yield from papercall.scrape()
     print('Scraping Devopsdays')
     yield from devopsdays.scrape()
+    print('Scraping Sessionize')
+    yield from sessionize.scrape()


 def sync_record(existing, fields):
@@ -20,11 +23,11 @@ def sync_record(existing, fields):
         elif isinstance(value, date):
             fields[key] = value.isoformat()
     if not fields.get('Conference Start Date'):
-        fields.pop('Conference Start Date')
+        fields.pop('Conference Start Date', None)
     if not fields.get('Conference End Date'):
-        fields.pop('Conference End Date')
+        fields.pop('Conference End Date', None)
     if not fields.get('Tags'):
-        fields.pop('Tags')
+        fields.pop('Tags', None)

     # No existing version, create it.
     if existing is None:
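
The two-argument form of dict.pop matters here because fields.get(key) is falsy both when the key is missing and when its value is empty, so the bare pop could raise KeyError on a record that never had the field. A minimal standalone sketch of the difference (not part of the diff):

    fields = {'Tags': ''}  # 'Tags' is present but empty; the date fields are absent entirely

    if not fields.get('Tags'):
        fields.pop('Tags')                         # ok: the key exists, it is just empty
    if not fields.get('Conference Start Date'):
        fields.pop('Conference Start Date', None)  # the None default swallows the missing key
        # without the default, this pop would raise KeyError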
File 2 of 3: sessionize.py

 import requests
 from bs4 import BeautifulSoup

+import twitter_utils

 def get(url):
     res = requests.get(url)
@@ -17,6 +18,15 @@ def find_navy_section(root, label):
 def parse_event(url):
     root = get(url)

+    if root.find('span', string='Speaker Profile'):
+        return None
+
+    if 'Log in' in root.find('title').string:
+        return None
+
+    if '@ Sessionize.com' not in root.find('title').string:
+        return None
+
     data = {
         'Conference Name': root.select('.ibox-title h4')[0].string,
         'CFP URL': url,
@@ -66,6 +76,26 @@ def parse_event(url):

     return data

+
+def find_events():
+    seen_urls = set()
+    for url in twitter_utils.search_for_url('sessionize.com'):
+        # Strip the query params and downcase it.
+        clean_url = url.split('?')[0].lower().rstrip('/')
+        if clean_url in seen_urls:
+            continue
+        if '/api/' in clean_url:
+            continue
+        evt = parse_event(clean_url)
+        if evt is not None:
+            yield evt
+        seen_urls.add(clean_url)
+
+
+def scrape():
+    yield from find_events()
+
+
 if __name__ == '__main__':
-    print(parse_event('https://sessionize.com/mixit19'))
-    print(parse_event('https://sessionize.com/blockchain-saturday-Utah-v2'))
+    for d in find_events():
+        print(d)
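
The clean-up in find_events is doing most of the dedup work: links lifted from tweets tend to differ only in tracking parameters, casing, and trailing slashes. A quick standalone sketch of what that normalization produces (the URL is a made-up example):

    url = 'https://Sessionize.com/Some-Conf-2020/?utm_source=twitter'
    clean_url = url.split('?')[0].lower().rstrip('/')
    print(clean_url)  # https://sessionize.com/some-conf-2020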
File 3 of 3: twitter_utils.py (new file)

+import os
+
+import requests
+import tweepy
+
+auth = tweepy.OAuthHandler(os.environ['TWITTER_CONSUMER_KEY'], os.environ['TWITTER_CONSUMER_SECRET'])
+auth.set_access_token(os.environ['TWITTER_ACCESS_KEY'], os.environ['TWITTER_ACCESS_SECRET'])
+
+api = tweepy.API(auth)
+
+_expand_cache = {}
+
+def expand_url(url):
+    expanded = _expand_cache.get(url)
+    if expanded is not None:
+        return expanded
+    expanded = requests.head(url, allow_redirects=True).url
+    _expand_cache[url] = expanded
+    return expanded
+
+
+def search_for_url(query, total=1000):
+    max_id = None
+    last_max_id = None
+    count = 0
+    while count < total:
+        for tweet in api.search(q=query, count=100, max_id=max_id, result_type='recent'):
+            count += 1
+            if max_id:
+                max_id = min(max_id, tweet.id)
+            else:
+                max_id = tweet.id
+            for url in tweet.entities['urls']:
+                # Twitter only expands their own shortener, so follow the redirects ourselves.
+                truly_expanded_url = url['expanded_url']
+                if query not in truly_expanded_url:
+                    truly_expanded_url = expand_url(truly_expanded_url)
+                if query in truly_expanded_url:
+                    yield truly_expanded_url
+        # Did we run out of tweets?
+        if last_max_id == max_id:
+            break
+        last_max_id = max_id
+
+
+if __name__ == '__main__':
+    for url in search_for_url('sessionize.com'):
+        print(url)
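
The expanded_url that Twitter returns only unwraps its own t.co shortener; a link that also went through bit.ly or similar still points at the intermediate service, so expand_url follows the rest of the redirect chain with a HEAD request (no body download) and caches the final address. A minimal standalone illustration of that requests behaviour (the short link is a placeholder):

    import requests

    # allow_redirects=True makes HEAD follow the full redirect chain;
    # the response's .url attribute is the final destination.
    final = requests.head('https://bit.ly/some-short-link', allow_redirects=True).url
    print(final)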