Skip to content

Commit b774929

Browse files
committed
Initial commit.
0 parents  commit b774929

File tree

6 files changed

+468
-0
lines changed

6 files changed

+468
-0
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.envrc
2+
.venv/
3+
__pycache__/

devopsdays.py

+97
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import re
2+
3+
import dateparser
4+
import requests
5+
from bs4 import BeautifulSoup
6+
7+
def get(url, timeout=30):
    """Download ``url`` and return its body parsed as HTML.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    res = requests.get(url, timeout=timeout)
    # The HTTP status is not checked: whatever body came back is parsed.
    return BeautifulSoup(res.text, 'html.parser')
10+
11+
12+
def parse_events():
    """Yield the ``href`` of every link in the events listing page.

    Fetches https://www.devopsdays.org/events/ and walks the anchors inside
    the second element matched by the ``.col-md-12 .row`` selector.
    """
    page = get('https://www.devopsdays.org/events/')
    listing = page.select('.col-md-12 .row')[1]
    for anchor in listing.find_all('a'):
        yield anchor['href']
16+
17+
18+
def parse_open_cfps():
    """Yield one dict per row of the sortable table on the speaking page.

    Each dict carries:
        'Location': text of the row's first link.
        'Conference URL': that link's href prefixed with the site origin.
        'CFP End Date': whatever ``dateparser.parse`` returns for the second cell.
        'Conference Start Date': the date part of the parsed third cell.
    """
    page = get('https://www.devopsdays.org/speaking/')
    for row in page.select('table.sortable tbody tr'):
        # The first anchor in the row supplies both the label and the URL.
        link = row.find('a')
        location = link.string
        conference_url = 'https://www.devopsdays.org' + link['href']

        cells = row.find_all('td')
        cfp_end = dateparser.parse(cells[1].string.strip())
        conference_start = dateparser.parse(cells[2].string.strip()).date()

        yield {
            'Location': location,
            'Conference URL': conference_url,
            'CFP End Date': cfp_end,
            'Conference Start Date': conference_start,
        }
27+
28+
29+
def parse_event(url):
    """Scrape one event's welcome page for its CFP link, end date and name.

    Args:
        url: Event base URL. ``'welcome/'`` is appended as-is, so the URL is
            expected to end with a slash.

    Returns:
        ``None`` when no CFP link is found on the page; otherwise a dict with
        the keys ``'Conference Name'``, ``'CFP URL'``,
        ``'Conference End Date'`` (a ``date``) and ``'Tags'``.

    Raises:
        ValueError: If the page has no "Dates" label and the text of
            ``.welcome-page-date`` does not look like "April 9 - 10, 2019".
    """
    root = get(url + 'welcome/')

    # First choice: a navigation link whose label mentions proposing / the CFP.
    cfp_nav = None
    for nav in root.select('.nav-link'):
        nav_text = str(nav.string).lower()
        if 'propose' in nav_text or 'cfp' in nav_text:
            cfp_nav = nav
            break
    if cfp_nav is None:
        # Fallback: the link in the node that follows a bold "Propose" label.
        propose_elm = root.find('strong', string='Propose')
        if propose_elm:
            cfp_nav = propose_elm.parent.next_sibling.find('a')
    if cfp_nav is None:
        return None
    cfp_url = cfp_nav['href']
    if cfp_url.startswith('/'):
        # Site-relative link; make it absolute.
        cfp_url = f'https://www.devopsdays.org{cfp_url}'

    dates_elm = root.find('strong', string='Dates')
    if dates_elm:
        # Text after the "Dates" label; the end date is the last '-' separated part.
        dates = dates_elm.parent.next_sibling.string.split('-')
        event_end = dateparser.parse(dates[-1]).date()
    else:
        dates = root.select('.welcome-page-date')[0].contents[0]
        # Looks like "April 9 - 10, 2019"
        md = re.match(r'^(\S+) ([ 0-9-]+), (\d+)$', dates)
        if md is None:
            raise ValueError(f'Unable to find end date in {url}')
        month, days, year = md.group(1, 2, 3)
        if '-' in days:
            start_day, end_day = days.split('-')
        else:
            start_day = end_day = days
        event_end = dateparser.parse(f'{month} {end_day}, {year}').date()
        if int(start_day) > int(end_day):
            # The day range wraps into the following month (e.g. "April 30 - 1").
            if event_end.month == 12:
                # month + 1 would be 13 and make date.replace() raise.
                # NOTE(review): assumes the printed year belongs to the
                # starting month, so the end falls in the next year - confirm.
                event_end = event_end.replace(year=event_end.year + 1, month=1)
            else:
                event_end = event_end.replace(month=event_end.month + 1)

    # Capitalise only the first word of the heading and collapse whitespace.
    name_parts = root.select('.welcome-page')[0].string.split()
    name_parts[0] = name_parts[0].capitalize()
    name = ' '.join(name_parts)

    return {
        'Conference Name': name,
        'CFP URL': cfp_url,
        'Conference End Date': event_end,
        'Tags': ['devops', 'devopsdays'],
    }
79+
80+
81+
def scrape():
    """Yield merged records for open CFPs that are not hosted on Papercall.

    Every row from ``parse_open_cfps`` is enriched in place with the dict
    returned by ``parse_event``. Rows whose event page yields nothing are
    dropped, as are rows whose CFP URL contains ``papercall.io``.
    """
    for record in parse_open_cfps():
        details = parse_event(record['Conference URL'])
        if details is not None:
            record.update(details)
            # Papercall is already handled.
            if 'papercall.io' not in record['CFP URL']:
                yield record
91+
92+
if __name__ == '__main__':
    # Manual smoke test: print every scraped record. To debug a single stage,
    # call parse_event(<event url>) or iterate parse_open_cfps() here instead.
    for record in scrape():
        print(record)

main.py

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import itertools
2+
from datetime import date, datetime
3+
4+
import devopsdays
5+
import papercall
6+
import models
7+
8+
def scrape_all():
    """Yield the records of every scraper: Papercall first, then Devopsdays.

    A progress line is printed right before each scraper starts producing.
    """
    print('Scraping Papercall')
    for record in papercall.scrape():
        yield record
    print('Scraping Devopsdays')
    for record in devopsdays.scrape():
        yield record
13+
14+
15+
def sync_record(existing, fields):
    """Create the conference described by ``fields`` or update ``existing``.

    ``fields`` is normalised in place: ``datetime`` values become
    ``YYYY-MM-DDTHH:MM:SS.000Z`` strings, ``date`` values become ISO strings,
    and the optional keys 'Conference Start Date', 'Conference End Date' and
    'Tags' are dropped when empty or absent.

    Args:
        existing: The stored record for this conference, or ``None`` when
            there is none. It is used through ``get``, ``update`` and ``save``.
        fields: Scraped values keyed by column name. Mutated in place.
    """
    # Convert any needed fields. ``datetime`` is tested first because it is a
    # subclass of ``date``. Iterate over a snapshot since values are rewritten.
    for key, value in list(fields.items()):
        if isinstance(value, datetime):
            fields[key] = value.replace(microsecond=0, tzinfo=None).isoformat() + '.000Z'
        elif isinstance(value, date):
            fields[key] = value.isoformat()

    # Drop empty optional fields. A scraper may omit the key altogether, so
    # pop with a default; a bare pop() would raise KeyError.
    for optional in ('Conference Start Date', 'Conference End Date', 'Tags'):
        if not fields.get(optional):
            fields.pop(optional, None)

    # No existing version, create it.
    if existing is None:
        conf = models.Conference(**fields)
        print(f'Creating {conf}')
        conf.save()
        return

    # Save only when at least one field differs; stop at the first difference.
    for key, value in fields.items():
        existing_value = existing.get(key)
        if key == 'Tags' and value and existing_value:
            # Tags are compared ignoring order.
            changed = sorted(value) != sorted(existing_value)
        elif value == '' and existing_value is None:
            # Special case, None and '' are considered equal.
            changed = False
        else:
            changed = value != existing_value

        if changed:
            print('{} {} {}'.format(key, repr(value), repr(existing_value)))
            print(f'Updating {existing}')
            existing.update(fields)
            existing.save()
            break
60+
61+
62+
def sync_all():
    """Scrape every source and sync each record against the stored ones.

    All stored conferences are loaded once and indexed by their 'CFP URL';
    each scraped record is then handed to ``sync_record`` together with the
    stored record that has the same URL (or ``None``).
    """
    # Local cache of stored conferences, keyed by CFP URL.
    by_cfp_url = {conf['CFP URL']: conf for conf in models.Conference.fetch_all()}

    # Run the scrapes and syncs.
    for scraped in scrape_all():
        sync_record(by_cfp_url.get(scraped['CFP URL']), scraped)
71+
72+
73+
def main():
    """Script entry point: run one full scrape-and-sync pass."""
    sync_all()
75+
76+
77+
# Run the sync only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

models.py

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import os
2+
from datetime import datetime
3+
4+
import airtable
5+
6+
7+
class AirtableModel(dict):
    """A dict of Airtable field values bound to one table.

    Subclasses set ``table_name``. ``airtable_id`` holds the record id, or
    ``None`` for a record that has not been inserted yet.
    """

    class AirtablePropety:
        """Descriptor that builds the table client on first use, once per class."""

        def __get__(_self, _instance, owner):
            # Look only in the class's own namespace: hasattr() would also
            # find a client cached on a parent model and return a handle to
            # the parent's table.
            if '_db' not in vars(owner):
                if not owner.table_name:
                    raise ValueError(f'{owner} does not define table_name')
                owner._db = airtable.Airtable(os.environ['AIRTABLE_BASE_KEY'], owner.table_name)
            return owner._db

    # Name of the Airtable table; must be overridden by subclasses.
    table_name = None
    # Table client, created lazily from the AIRTABLE_BASE_KEY environment variable.
    db = AirtablePropety()

    def __init__(self, airtable_id=None, **fields):
        """Store ``fields`` as the dict content and remember the record id."""
        self.airtable_id = airtable_id
        super().__init__(fields)

    @classmethod
    def fetch(cls, **query):
        """Return the record matching a single ``field=value`` query.

        Underscores in the keyword are turned into spaces to form the field
        name. When the lookup result carries no id or fields, the returned
        instance is empty and has ``airtable_id`` set to ``None``.

        Raises:
            ValueError: If ``query`` does not hold exactly one item.
        """
        if len(query) != 1:
            raise ValueError(f'Invalid fetch query: {query}')
        key, value = next(iter(query.items()))
        record = cls.db.match(key.replace('_', ' '), value)
        return cls(airtable_id=record.get('id'), **record.get('fields', {}))

    @classmethod
    def fetch_all(cls):
        """Yield one instance per record, page by page."""
        for page in cls.db.get_iter():
            for record in page:
                yield cls(airtable_id=record.get('id'), **record.get('fields', {}))

    def save(self):
        """Update the record when it has an id, otherwise insert it and keep the new id."""
        if self.airtable_id:
            self.db.update(self.airtable_id, self)
        else:
            record = self.db.insert(self)
            self.airtable_id = record['id']
44+
45+
46+
class Conference(AirtableModel):
    """A record of the 'Conferences' table, with tag links kept on the tag records."""

    table_name = 'Conferences'

    def __str__(self):
        """Label the conference by name, falling back to its CFP URL."""
        label = self.get('Conference Name') or self['CFP URL']
        return f'Conference: {label}'

    def save(self):
        """Save the conference, then bring the tag records in line with 'Tags'.

        'Tags' is removed from the payload for the duration of the save and
        restored afterwards, even when the save fails. Links are then written
        through the tag records.
        """
        # If we didn't have a CFP Start Date, just assume it's today.
        self.setdefault('CFP Start Date', str(datetime.utcnow().date()))

        # Handle the tags value.
        tags = self.pop('Tags', [])
        try:
            super().save()
        finally:
            # Restore it after the save.
            self['Tags'] = tags

        # Update any new tags.
        for t in tags:
            tag = Tag.fetch(Tag=t)
            if self.airtable_id not in tag.get('Conference', []):
                tag['Tag'] = t
                tag.setdefault('Conference', []).append(self.airtable_id)
                tag.save()

        # Remove any old tags.
        # NOTE(review): the link field is taken to be 'Conference', the name
        # used when adding links above - confirm against the table schema.
        for t in self.db.get(self.airtable_id)['fields'].get('Tags', []):
            if t not in tags:
                tag = Tag.fetch(Tag=t)
                linked = tag.get('Conference', [])
                if self.airtable_id in linked:
                    # Lists have remove(), not delete().
                    linked.remove(self.airtable_id)
                    tag.save()
80+
81+
82+
class Tag(AirtableModel):
    """A record of the 'Conference Tags' table."""

    table_name = 'Conference Tags'

0 commit comments

Comments
 (0)