Add a parser for the Mozilla ical feed.

coderanger · coderanger · commit 49efe60ded5a · 2019-01-26T20:48:02.000-08:00
diff --git a/main.py b/main.py
@@ -8,6 +8,7 @@
 import linux_foundation
 import seecfp
 import lwn
+import mozilla_calendar
 
 def scrape_all():
     print('Scraping Papercall')
@@ -22,6 +23,8 @@ def scrape_all():
     yield from seecfp.scrape()
     print('Scraping LWN CFP Calendar')
     yield from lwn.scrape()
+    print('Scraping Mozilla Calendar')
+    yield from mozilla_calendar.scrape()
 
 
 def sync_record(existing, fields):
diff --git a/models.py b/models.py
@@ -2,6 +2,7 @@
 from datetime import datetime
 
 import airtable
+import dateparser
 
 
 class AirtableModel(dict):
@@ -43,6 +44,14 @@ def save(self):
             self.airtable_id = record['id']
 
 
+def datetime_lt(a, b):
+    """Return True when `a` is strictly earlier than `b`; str/bytes operands go through dateparser first."""
+    # Parse in argument order, leaving non-text values untouched.
+    a, b = (dateparser.parse(v) if isinstance(v, (str, bytes)) else v for v in (a, b))
+    # Both sides are made naive by discarding tzinfo (no conversion), so aware and naive values can be compared.
+    return a.replace(tzinfo=None) < b.replace(tzinfo=None)
+
+
 class Conference(AirtableModel):
     table_name = 'Conferences'
 
@@ -54,7 +63,14 @@ def __str__(self):
 
     def save(self):
         # If we didn't have a CFP Start Date, just assume it's today.
-        self.setdefault('CFP Start Date', str(datetime.utcnow().date()))
+        if 'CFP Start Date' not in self:
+            if 'CFP End Date' in self and datetime_lt(self['CFP End Date'], datetime.now()):
+                d = self['CFP End Date']
+                if isinstance(d, (str, bytes)):
+                    d = str(dateparser.parse(d).date())
+                self['CFP Start Date'] = d
+            else:
+                self['CFP Start Date'] = str(datetime.utcnow().date())
 
         # Clear computed fields.
         end_date_only = self.pop('CFP End Date (Only)', None)
diff --git a/mozilla_calendar.py b/mozilla_calendar.py
@@ -0,0 +1,119 @@
+# https://calendar.google.com/calendar/ical/mozilla.com_tptb36ac7eijerilfnf6c1onfo%40group.calendar.google.com/public/basic.ics
+import re
+from datetime import datetime
+
+import dateparser
+import requests
+import ics
+from urlextract import URLExtract
+
+import sessionize
+
+FLAG_A = ord('🇦')  # first regional indicator symbol (letter A)
+FLAG_Z = FLAG_A + 25  # last one (letter Z): 26 letters span offsets 0..25 and the range test is inclusive
+FLAG_OFFSET = FLAG_A - ord('A')  # distance from a regional indicator down to its ASCII capital
+URL_EXTRACTOR = URLExtract()  # single shared extractor, built once at import time
+
+
+def fetch_cal():  # Download the public Mozilla calendar feed and parse it into an ics.Calendar.
+    url = 'https://calendar.google.com/calendar/ical/mozilla.com_tptb36ac7eijerilfnf6c1onfo%40group.calendar.google.com/public/basic.ics'
+    return ics.Calendar(requests.get(url, timeout=30).text)  # requests waits forever unless a timeout is given
+
+
+def convert_flags(s):
+    # Replace every code point in FLAG_A..FLAG_Z with the character FLAG_OFFSET below it; copy the rest as-is.
+    return ''.join(chr(ord(ch) - FLAG_OFFSET) if FLAG_A <= ord(ch) <= FLAG_Z else ch for ch in s)
+
+
+def parse_event_url(evt):
+    # Return the first URL found in the event description, or None. A missing description is read as empty text.
+    links = URL_EXTRACTOR.find_urls(evt.description or '')
+    return links[0] if links else None
+
+
+def parse_date(raw_date, relative_to):
+    """Split text like 'Mar 3 - Apr 2, Berlin' into (start date, end date, remaining text).
+
+    Year-less dates are resolved by dateparser relative to `relative_to`, preferring
+    the future. Returns (None, None, raw_date) when no usable date is found.
+    """
+    s = {'PREFER_DATES_FROM': 'future', 'RELATIVE_BASE': relative_to.replace(tzinfo=None)}
+    # Most specific shape first: 'Mar 3 - Apr 2', 'Mar 3 - 5', 'Mar 3'; the tuples name the start/end regex groups.
+    shapes = (
+        (r'^(\w+) (\d+)\s*-\s*(\w+) (\d+)(.*)$', (1, 2), (3, 4)),
+        (r'^(\w+) (\d+)\s*-\s*(\d+)(.*)$', (1, 2), (1, 3)),
+        (r'^(\w+) (\d+)(.*)$', (1, 2), (1, 2)),
+    )
+    for pattern, start_groups, end_groups in shapes:
+        md = re.search(pattern, raw_date)
+        if md:
+            start = dateparser.parse(' '.join(md.group(*start_groups)), settings=s)
+            end = dateparser.parse(' '.join(md.group(*end_groups)), settings=s)
+            if start is None or end is None:  # dateparser gives None for text it cannot read
+                break
+            return (start.date(), end.date(), md.groups()[-1])
+    return (None, None, raw_date)
+
+
+def parse_event_name(label, relative_to):
+    """Turn a calendar entry title into a dict of conference fields.
+
+    Titles shaped like 'Name (dates, place)' yield the cleaned name plus whatever
+    dates and location parse_date() extracts; any other title becomes the name as-is.
+    """
+    label = convert_flags(label)
+    md = re.search(r'^(.*) \((.*?)\)$', label)
+    if md is None:
+        return {'Conference Name': label.strip()}
+    name = md.group(1)
+    # Drop the literal 'CFP', then any free-standing ':' left behind, then squeeze whitespace.
+    name = re.sub(r'(^| ):( |$)', ' ', name.replace('CFP', ''))
+    name = re.sub(r'\s+', ' ', name).strip()
+    # The parenthesised part holds the dates followed by the location.
+    start_date, end_date, location = parse_date(md.group(2), relative_to)
+    location = location.lstrip(',').strip()
+    evt = {'Conference Name': name}
+    # Optional fields are added only when they have a value.
+    optional = (
+        ('Conference Start Date', start_date),
+        ('Conference End Date', end_date),
+        ('Location', location),
+    )
+    evt.update((key, value) for key, value in optional if value)
+    return evt
+
+
+def parse_events(cal):
+    """Yield a dict of conference fields for each event in `cal` that began less than a year ago."""
+    now = datetime.utcnow()
+    cutoff = now.replace(year=now.year - 1, day=28 if (now.month, now.day) == (2, 29) else now.day)  # Feb 29 -> Feb 28
+    for evt in cal.events:
+        begins = evt.begin.datetime.replace(tzinfo=None)  # event start with tzinfo dropped; stored as the CFP end date
+        if begins < cutoff:
+            continue
+        data = parse_event_name(evt.name, begins)
+        if evt.location:
+            data['Location'] = evt.location  # an explicit calendar location wins over one parsed from the title
+        data['CFP End Date'] = begins
+        url = parse_event_url(evt)
+        if url:
+            data['Conference URL'] = data['CFP URL'] = url
+        yield data
+
+
+def scrape():
+    """Yield each calendar event that has a CFP URL, skipping papercall.io links.
+
+    Events with sessionize.com links are updated with sessionize.parse_event()'s result."""
+    for evt in parse_events(fetch_cal()):
+        if evt is None or 'CFP URL' not in evt or 'papercall.io' in evt['CFP URL']:
+            continue
+        # Updating with an empty dict changes nothing, so only a truthy result is merged in.
+        details = sessionize.parse_event(evt['CFP URL']) if 'sessionize.com' in evt['CFP URL'] else None
+        evt.update(details or {})
+        yield evt
+
+
+if __name__ == '__main__':  # manual run: print every scraped event, one per line
+    for e in scrape():
+        print(e)
diff --git a/requirements.txt b/requirements.txt
@@ -16,3 +16,5 @@ soupsieve==1.7.2
 tweepy==3.7.0
 tzlocal==1.5.1
 urllib3==1.24.1
+ics==0.4
+urlextract==0.8.3