Commit a1ccb10

Fixed unintentionally omitted files
1 parent 5798b26 commit a1ccb10

File tree

3 files changed: 175 additions, 1 deletion


.gitignore

Lines changed: 1 addition & 1 deletion
@@ -116,4 +116,4 @@ venv.bak/
 **/pmc_custom_license/

 # Metadata
-**/metadata/
+**/datasets/metadata/

scripts/metadata/json_to_rdf.py

Lines changed: 51 additions & 0 deletions
import os
from zipfile import ZipFile

import jsonasobj
from rdflib import Graph

from scripts.metadata import METADATA_DIR, CONTEXT_DIR, DATASETS_DIR

CONTEXT = os.path.join(CONTEXT_DIR, 'metadata.context.json')
context_j = jsonasobj.load(CONTEXT)
BASE = context_j['@context']['@base']

n_converted = 0
for fname in os.listdir(METADATA_DIR):
    basename, ext = os.path.splitext(fname)
    if ext == '.json':
        g = Graph()
        g.parse(os.path.join(METADATA_DIR, fname), format="json-ld", context=CONTEXT, base=BASE)
        g.serialize(os.path.join(METADATA_DIR, basename + '.ttl'), format='ttl')
        n_converted += 1

print(f"*** {n_converted} files converted ***")

print(f"** Zipping metadata directory **")

files_per_zipfile = 20000
zipfile_series = 0
nfiles = 0
zf = None


def new_zipfile(current_zipfile: ZipFile) -> ZipFile:
    global zipfile_series

    if current_zipfile:
        current_zipfile.close()
    zipfile_series += 1
    zf_name = f"metadata{zipfile_series}.zip"
    print(f"\tWriting {zf_name}")
    zf = ZipFile(os.path.join(DATASETS_DIR, zf_name), "w")
    zf.write(METADATA_DIR)
    return zf


for file in os.listdir(METADATA_DIR):
    if zf is None or nfiles > files_per_zipfile:
        zf = new_zipfile(zf)
        nfiles = 0
    zf.write(os.path.join(METADATA_DIR, file))
    nfiles += 1

zf.close()
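
As a side note on the conversion step, the script depends on rdflib being able to parse JSON-LD, which requires either rdflib 6.0+ or the rdflib-jsonld plugin on older releases. The snippet below is a minimal, self-contained sketch of that per-file JSON-LD-to-Turtle step; the inline document and context are invented purely for illustration and are not taken from the dataset.

# A minimal sketch of the JSON-LD -> Turtle conversion done per file above.
# Assumes JSON-LD support is available (rdflib >= 6.0 or the rdflib-jsonld plugin).
# The document below is illustrative only.
from rdflib import Graph

doc = """
{
  "@context": {"@vocab": "http://example.org/terms/",
               "@base": "http://example.org/id/"},
  "@id": "abc123",
  "title": "An example record"
}
"""

g = Graph()
g.parse(data=doc, format="json-ld")
print(g.serialize(format="turtle"))

In the script itself the context is supplied externally via the context= and base= arguments, which suggests the per-record JSON files do not need to embed a context themselves.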

scripts/metadata/metadata_to_json.py

Lines changed: 123 additions & 0 deletions
"""
Process the all_sources_metadata file producing both a JSON and an RDF rendering in the metadata output directory
"""
import os
import string
from csv import DictReader
from typing import Optional, Set, Dict, List, Tuple

from jsonasobj import as_json, JsonObj
from rdflib import Namespace

from scripts.metadata import prefixes, DATASETS_DIR, SOURCE_DIR, METADATA_DIR

MISSING_FILE = 'MISSING'

# Root of datasets
data_subdirs = [e for e in os.listdir(DATASETS_DIR) if e != 'metadata' and '.' not in e]
subdir_contents: Dict[str, Set[str]] = dict()

# TODO: Can we find WHO Covidence?
IDENTIFIERS: List[Tuple[str, Namespace]] = [
    ('pubmed_id', prefixes.PUBMED),
    ('pmcid', prefixes.PMC),
    ('doi', prefixes.DOI),
    ('Microsoft Academic Paper Id', prefixes.MS_ACADEMIC)
]

SUBDIR_MAP: Dict[str, str] = {
    "biorxiv_medrxiv": "bioRxiv-medRxiv",
    "comm_use_subset": "Commercial",
    "pmc_custom_license": "PMC",
    "noncomm_use_subset": "Non-comercial"
}


def generate_identifier(entry: JsonObj) -> None:
    """
    Generate an "id" entry for entry
    :param entry: metadata entry
    """
    if hasattr(entry, 'sha'):
        row_j.id = row_j.sha
    else:
        for identifier, namespace in IDENTIFIERS:
            if hasattr(entry, identifier):
                row_j.id = namespace[entry[identifier].split()[0]]
                break


def normalize_namespaces(entry: JsonObj) -> None:
    """ Some of the identifier fields actually carry multiple values. Instead of representing this as a list, the
    metadata represents them as space-separated values. Turn them into lists if needed
    """
    for identifier, _ in IDENTIFIERS:
        ids = getattr(entry, identifier, None)
        if ids and ' ' in ids:
            setattr(entry, identifier, ids.split())


# Generate a list of all the files we know about
for subdir in data_subdirs:
    for fname in os.listdir(os.path.join(DATASETS_DIR, subdir)):
        if fname[0] in string.hexdigits and fname.endswith(".json"):
            subdir_contents.setdefault(subdir, set()).add(fname)
subdir_contents[MISSING_FILE] = set()


def which_subdir(sha: str) -> Optional[str]:
    """ Determine which subset (if any) sha is represented in """
    fname = sha + '.json'
    for k, v in subdir_contents.items():
        if fname in v:
            subdir_contents[k].remove(fname)
            return k
    subdir_contents[MISSING_FILE].add(fname)
    return MISSING_FILE


print("*** Starting Content ***")
for subdir in data_subdirs:
    print(f"\t{subdir}: {len(subdir_contents[subdir])}")
print()

# Load the metadata CSV and emit each row as a JSON file in the metadata directory
with open(os.path.join(SOURCE_DIR, 'all_sources_metadata_2020-03-13.csv')) as f:
    reader = DictReader(f)

    known_sources = dict()
    known_subdirs = dict()
    source_x_to_subdir = dict()

    row_num = 0
    for row in reader:
        row_num += 1
        row_j = JsonObj(**{k: v for k, v in row.items() if v != ""})
        generate_identifier(row_j)
        normalize_namespaces(row_j)

        if hasattr(row_j, "sha"):
            subdir = which_subdir(row_j.sha)
            if subdir in SUBDIR_MAP:
                row_j.fhir_link = SUBDIR_MAP[subdir] + '/' + row_j.sha

        if hasattr(row_j, "authors"):
            row_j.authors = [a.strip() for a in row_j.authors.split(';')]
        with open(os.path.join(METADATA_DIR, f'e{row_num}.json'), 'w') as json_file:
            # print(f"***** Writing {row_num}.json")
            json_file.write(as_json(row_j))

        known_sources.setdefault(row_j.source_x, 0)
        known_sources[row_j.source_x] += 1
        known_subdirs.setdefault(subdir, 0)
        known_subdirs[subdir] += 1

print(f"{row_num} entries written")
print("\n*** File by source_x ***")
print(known_sources)
print("\n*** File by subdirectory ***")
print(known_subdirs)
print("\n*** Unreferenced files ***")
for subdir in data_subdirs:
    rem_ents = len(subdir_contents[subdir])
    if rem_ents:
        print(f"\t{subdir}: {rem_ents}")
