3 files changed: +86 -5 lines changed
File 1 of 3

 import devopsdays
 import papercall
 import models
+import sessionize

 def scrape_all():
     print('Scraping Papercall')
     yield from papercall.scrape()
     print('Scraping Devopsdays')
     yield from devopsdays.scrape()
+    print('Scraping Sessionize')
+    yield from sessionize.scrape()


 def sync_record(existing, fields):
@@ -20,11 +23,11 @@ def sync_record(existing, fields):
         elif isinstance(value, date):
             fields[key] = value.isoformat()
     if not fields.get('Conference Start Date'):
-        fields.pop('Conference Start Date')
+        fields.pop('Conference Start Date', None)
     if not fields.get('Conference End Date'):
-        fields.pop('Conference End Date')
+        fields.pop('Conference End Date', None)
     if not fields.get('Tags'):
-        fields.pop('Tags')
+        fields.pop('Tags', None)

     # No existing version, create it.
     if existing is None:
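
The two-argument form of dict.pop matters here because fields.get(key) is falsy both when the key is missing and when its value is empty, so the bare pop could raise KeyError on a record that never had the field. A minimal standalone sketch of the difference (not part of the diff):

    fields = {'Tags': ''}  # 'Tags' is present but empty; the date fields are absent entirely

    if not fields.get('Tags'):
        fields.pop('Tags')                         # ok: the key exists, it is just empty
    if not fields.get('Conference Start Date'):
        fields.pop('Conference Start Date', None)  # the None default swallows the missing key
        # without the default, this pop would raise KeyError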
File 2 of 3: sessionize.py

 import requests
 from bs4 import BeautifulSoup

+import twitter_utils

 def get(url):
     res = requests.get(url)
@@ -17,6 +18,15 @@ def find_navy_section(root, label):
 def parse_event(url):
     root = get(url)

+    if root.find('span', string='Speaker Profile'):
+        return None
+
+    if 'Log in' in root.find('title').string:
+        return None
+
+    if '@ Sessionize.com' not in root.find('title').string:
+        return None
+
     data = {
         'Conference Name': root.select('.ibox-title h4')[0].string,
         'CFP URL': url,
@@ -66,6 +76,26 @@ def parse_event(url):

     return data

+
+def find_events():
+    seen_urls = set()
+    for url in twitter_utils.search_for_url('sessionize.com'):
+        # Strip the query params and downcase it.
+        clean_url = url.split('?')[0].lower().rstrip('/')
+        if clean_url in seen_urls:
+            continue
+        if '/api/' in clean_url:
+            continue
+        evt = parse_event(clean_url)
+        if evt is not None:
+            yield evt
+        seen_urls.add(clean_url)
+
+
+def scrape():
+    yield from find_events()
+
+
 if __name__ == '__main__':
-    print(parse_event('https://sessionize.com/mixit19'))
-    print(parse_event('https://sessionize.com/blockchain-saturday-Utah-v2'))
+    for d in find_events():
+        print(d)
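
The clean-up in find_events is doing most of the dedup work: links lifted from tweets tend to differ only in tracking parameters, casing, and trailing slashes. A quick standalone sketch of what that normalization produces (the URL is a made-up example):

    url = 'https://Sessionize.com/Some-Conf-2020/?utm_source=twitter'
    clean_url = url.split('?')[0].lower().rstrip('/')
    print(clean_url)  # https://sessionize.com/some-conf-2020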
File 3 of 3: twitter_utils.py (new file)

+import os
+
+import requests
+import tweepy
+
+auth = tweepy.OAuthHandler(os.environ['TWITTER_CONSUMER_KEY'], os.environ['TWITTER_CONSUMER_SECRET'])
+auth.set_access_token(os.environ['TWITTER_ACCESS_KEY'], os.environ['TWITTER_ACCESS_SECRET'])
+
+api = tweepy.API(auth)
+
+_expand_cache = {}
+
+def expand_url(url):
+    expanded = _expand_cache.get(url)
+    if expanded is not None:
+        return expanded
+    expanded = requests.head(url, allow_redirects=True).url
+    _expand_cache[url] = expanded
+    return expanded
+
+
+def search_for_url(query, total=1000):
+    max_id = None
+    last_max_id = None
+    count = 0
+    while count < total:
+        for tweet in api.search(q=query, count=100, max_id=max_id, result_type='recent'):
+            count += 1
+            if max_id:
+                max_id = min(max_id, tweet.id)
+            else:
+                max_id = tweet.id
+            for url in tweet.entities['urls']:
+                # Twitter only expands their own shortener, so follow the redirects ourselves.
+                truly_expanded_url = url['expanded_url']
+                if query not in truly_expanded_url:
+                    truly_expanded_url = expand_url(truly_expanded_url)
+                if query in truly_expanded_url:
+                    yield truly_expanded_url
+        # Did we run out of tweets?
+        if last_max_id == max_id:
+            break
+        last_max_id = max_id
+
+
+if __name__ == '__main__':
+    for url in search_for_url('sessionize.com'):
+        print(url)
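
The expanded_url that Twitter returns only unwraps its own t.co shortener; a link that also went through bit.ly or similar still points at the intermediate service, so expand_url follows the rest of the redirect chain with a HEAD request (no body download) and caches the final address. A minimal standalone illustration of that requests behaviour (the short link is a placeholder):

    import requests

    # allow_redirects=True makes HEAD follow the full redirect chain;
    # the response's .url attribute is the final destination.
    final = requests.head('https://bit.ly/some-short-link', allow_redirects=True).url
    print(final)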