Skip to content

Commit 49efe60

Browse files
committed
Add a parser for the Mozilla ical feed.
1 parent 2bef35b commit 49efe60

File tree

4 files changed

+141
-1
lines changed

4 files changed

+141
-1
lines changed

main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import linux_foundation
99
import seecfp
1010
import lwn
11+
import mozilla_calendar
1112

1213
def scrape_all():
1314
print('Scraping Papercall')
@@ -22,6 +23,8 @@ def scrape_all():
2223
yield from seecfp.scrape()
2324
print('Scraping LWN CFP Calendar')
2425
yield from lwn.scrape()
26+
print('Scraping Mozilla Calendar')
27+
yield from mozilla_calendar.scrape()
2528

2629

2730
def sync_record(existing, fields):

models.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from datetime import datetime
33

44
import airtable
5+
import dateparser
56

67

78
class AirtableModel(dict):
@@ -43,6 +44,14 @@ def save(self):
4344
self.airtable_id = record['id']
4445

4546

47+
def datetime_lt(a, b):
    """Return True when *a* is strictly earlier than *b*.

    Either argument may be a datetime-like object or a ``str``/``bytes``
    value, in which case it is first run through ``dateparser.parse``.
    Timezone information is discarded (not converted) on both sides before
    comparing, so the comparison is on wall-clock values only.
    """
    # Parse both operands first, then strip tzinfo, mirroring the order in
    # which failures would surface.
    left, right = [
        dateparser.parse(value) if isinstance(value, (str, bytes)) else value
        for value in (a, b)
    ]
    return left.replace(tzinfo=None) < right.replace(tzinfo=None)
53+
54+
4655
class Conference(AirtableModel):
4756
table_name = 'Conferences'
4857

@@ -54,7 +63,14 @@ def __str__(self):
5463

5564
def save(self):
5665
# If we didn't have a CFP Start Date, just assume it's today.
57-
self.setdefault('CFP Start Date', str(datetime.utcnow().date()))
66+
if 'CFP Start Date' not in self:
67+
if 'CFP End Date' in self and datetime_lt(self['CFP End Date'], datetime.now()):
68+
d = self['CFP End Date']
69+
if isinstance(d, (str, bytes)):
70+
d = str(dateparser.parse(d).date())
71+
self['CFP Start Date'] = d
72+
else:
73+
self['CFP Start Date'] = str(datetime.utcnow().date())
5874

5975
# Clear computed fields.
6076
end_date_only = self.pop('CFP End Date (Only)', None)

mozilla_calendar.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# https://calendar.google.com/calendar/ical/mozilla.com_tptb36ac7eijerilfnf6c1onfo%40group.calendar.google.com/public/basic.ics
2+
import re
3+
from datetime import datetime
4+
5+
import dateparser
6+
import requests
7+
import ics
8+
from urlextract import URLExtract
9+
10+
import sessionize
11+
12+
# Regional-indicator symbols (the characters that pair up to render flag
# emoji) occupy 26 consecutive code points, U+1F1E6 ('🇦') .. U+1F1FF ('🇿').
FLAG_A = ord('\U0001F1E6')
# Inclusive upper bound of that range: 'Z' sits 25 code points after 'A'.
# (Using +26 would also capture U+1F200 and turn it into '['.)
FLAG_Z = FLAG_A + 25
# Distance between a regional-indicator symbol and its ASCII capital letter.
FLAG_OFFSET = FLAG_A - ord('A')
15+
# Module-level URLExtract instance, created once at import time and used by
# parse_event_url to find links in event descriptions.
URL_EXTRACTOR = URLExtract()
16+
17+
18+
def fetch_cal():
    """Download the public Mozilla Google Calendar iCal feed and parse it.

    Returns:
        An ``ics.Calendar`` built from the body of the feed.

    Raises:
        requests.RequestException: if the connection fails, the request
            times out, or the server answers with an HTTP error status.
    """
    url = 'https://calendar.google.com/calendar/ical/mozilla.com_tptb36ac7eijerilfnf6c1onfo%40group.calendar.google.com/public/basic.ics'
    # requests has no default timeout; without one a stalled connection
    # would hang the whole scrape indefinitely.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors here instead of handing an error page to the
    # iCal parser.
    response.raise_for_status()
    return ics.Calendar(response.text)
21+
22+
23+
def convert_flags(s):
    """Return *s* with flag-letter characters replaced by plain capitals.

    Any character whose code point lies between ``FLAG_A`` and ``FLAG_Z``
    (inclusive) is shifted down by ``FLAG_OFFSET``; every other character
    is passed through unchanged.
    """
    def to_plain(ch):
        code = ord(ch)
        if FLAG_A <= code <= FLAG_Z:
            return chr(code - FLAG_OFFSET)
        return ch

    return ''.join(map(to_plain, s))
26+
27+
28+
def parse_event_url(evt):
    """Return the first URL found in an event's description.

    Args:
        evt: an event object exposing a ``description`` attribute.

    Returns:
        The first URL reported by ``URL_EXTRACTOR.find_urls``, or ``None``
        when the event has no description or the description has no URL.
    """
    description = evt.description
    # Events without a description carry None (or an empty string); the
    # extractor expects text, so bail out before calling it.
    if not description:
        return None
    links = URL_EXTRACTOR.find_urls(description)
    return links[0] if links else None
32+
33+
34+
def parse_date(raw_date, relative_to):
    """Split a "dates, location" fragment into start date, end date and rest.

    Three layouts are recognised, tried from most to least specific:

    * ``"<month> <day> - <month> <day><rest>"``
    * ``"<month> <day> - <day><rest>"``
    * ``"<month> <day><rest>"`` (start and end are the same day)

    Args:
        raw_date: the text to interpret.
        relative_to: datetime used as dateparser's ``RELATIVE_BASE``; its
            tzinfo is dropped. Dates are resolved preferring the future
            relative to this base.

    Returns:
        A ``(start_date, end_date, remainder)`` tuple. When no layout
        matches, or the matched text cannot be turned into dates, the
        result is ``(None, None, raw_date)``.
    """
    settings = {
        'PREFER_DATES_FROM': 'future',
        'RELATIVE_BASE': relative_to.replace(tzinfo=None),
    }

    def to_date(text):
        # dateparser.parse returns None for text it cannot interpret;
        # propagate that instead of crashing on ``None.date()``.
        parsed = dateparser.parse(text, settings=settings)
        return parsed.date() if parsed else None

    # (regex, start-date template, end-date template); templates are filled
    # with the regex groups, and the last group is always the remainder.
    layouts = (
        (r'^(\w+) (\d+)\s*-\s*(\w+) (\d+)(.*)$', '{0} {1}', '{2} {3}'),
        (r'^(\w+) (\d+)\s*-\s*(\d+)(.*)$', '{0} {1}', '{0} {2}'),
        (r'^(\w+) (\d+)(.*)$', '{0} {1}', '{0} {1}'),
    )
    for pattern, start_template, end_template in layouts:
        md = re.search(pattern, raw_date)
        if not md:
            continue
        groups = md.groups()
        start = to_date(start_template.format(*groups))
        if end_template == start_template:
            end = start
        else:
            end = to_date(end_template.format(*groups))
        # A layout can match text that is not really a date (e.g. a venue
        # name followed by a number); fall through to the next layout.
        if start and end:
            return (start, end, groups[-1])
    return (None, None, raw_date)
56+
57+
58+
def parse_event_name(label, relative_to):
    """Turn a calendar event title into a partial conference record.

    Titles of the form ``"<name> (<dates and location>)"`` are split: the
    name has the word "CFP" and stray colons removed, and the parenthesised
    part is handed to ``parse_date``. Any other title is returned stripped
    as the conference name only.

    Args:
        label: the raw event title; flag characters are converted first.
        relative_to: reference datetime forwarded to ``parse_date``.

    Returns:
        A dict with ``'Conference Name'`` and, when they could be
        determined, ``'Conference Start Date'``, ``'Conference End Date'``
        and ``'Location'``.
    """
    label = convert_flags(label)
    match = re.search(r'^(.*) \((.*?)\)$', label)
    if match is None:
        return {'Conference Name': label.strip()}

    raw_name, details = match.group(1), match.group(2)

    # Try to filter out the word CFP, a colon left standing on its own,
    # and any resulting runs of whitespace.
    cleaned = raw_name.replace('CFP', '')
    cleaned = re.sub(r'(^| ):( |$)', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    start_date, end_date, location = parse_date(details, relative_to)
    # Whatever follows the dates is the location, usually comma-prefixed.
    location = location.lstrip(',').strip()

    record = {'Conference Name': cleaned}
    optional_fields = (
        ('Conference Start Date', start_date),
        ('Conference End Date', end_date),
        ('Location', location),
    )
    for key, value in optional_fields:
        if value:
            record[key] = value
    return record
84+
85+
86+
def parse_events(cal):
    """Yield a record dict for every calendar event from the last year on.

    Args:
        cal: a calendar object exposing an iterable ``events`` attribute;
            each event provides ``begin.datetime``, ``name``, ``location``
            and ``description``.

    Yields:
        The dict produced by ``parse_event_name`` for the event, extended
        with ``'CFP End Date'`` (the event start with tzinfo dropped),
        ``'Location'`` when the event has one, and ``'Conference URL'`` /
        ``'CFP URL'`` when a link is found in the description.
    """
    # Skip anything that closed more than a year ago.
    now = datetime.utcnow()
    try:
        cutoff = now.replace(year=now.year - 1)
    except ValueError:
        # Running on Feb 29: that day does not exist in the previous year,
        # so fall back to Feb 28 instead of crashing.
        cutoff = now.replace(year=now.year - 1, day=28)

    for evt in cal.events:
        closes = evt.begin.datetime.replace(tzinfo=None)
        if closes < cutoff:
            continue
        data = parse_event_name(evt.name, evt.begin.datetime)
        # An explicit location on the event wins over one parsed from the title.
        if evt.location:
            data['Location'] = evt.location
        data['CFP End Date'] = closes
        url = parse_event_url(evt)
        if url:
            data['Conference URL'] = data['CFP URL'] = url
        yield data
102+
103+
104+
def scrape():
    """Yield conference records scraped from the Mozilla calendar feed.

    Records without a ``'CFP URL'`` are dropped, as are records whose CFP
    URL contains ``papercall.io``. Records whose CFP URL contains
    ``sessionize.com`` are merged with whatever ``sessionize.parse_event``
    returns for that URL, when it returns something truthy.
    """
    for record in parse_events(fetch_cal()):
        if record is not None and 'CFP URL' in record:
            cfp_url = record['CFP URL']
            # NOTE(review): presumably Papercall links are skipped because
            # another scraper already covers them - confirm.
            if 'papercall.io' in cfp_url:
                continue
            if 'sessionize.com' in cfp_url:
                details = sessionize.parse_event(cfp_url)
                if details:
                    record.update(details)
            yield record
115+
116+
117+
if __name__ == '__main__':
    # Running the module directly prints every scraped record, one per line.
    for record in scrape():
        print(record)

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@ soupsieve==1.7.2
1616
tweepy==3.7.0
1717
tzlocal==1.5.1
1818
urllib3==1.24.1
19+
ics==0.4
20+
urlextract==0.8.3

0 commit comments

Comments
 (0)