"""
Process the all_sources_metadata file, producing both a JSON and an RDF rendering in the metadata output directory
"""
import os
import string
from csv import DictReader
from typing import Optional, Set, Dict, List, Tuple

from jsonasobj import as_json, JsonObj
from rdflib import Namespace

from scripts.metadata import prefixes, DATASETS_DIR, SOURCE_DIR, METADATA_DIR

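# Pseudo-subdirectory used to collect metadata rows whose sha has no matching dataset file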
MISSING_FILE = 'MISSING'

# Root of datasets
data_subdirs = [e for e in os.listdir(DATASETS_DIR) if e != 'metadata' and '.' not in e]
subdir_contents: Dict[str, Set[str]] = dict()

# TODO: Can we find WHO Covidence?
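# Identifier columns in priority order -- when a row has no sha, the first column present supplies its "id"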
IDENTIFIERS: List[Tuple[str, Namespace]] = [
    ('pubmed_id', prefixes.PUBMED),
    ('pmcid', prefixes.PMC),
    ('doi', prefixes.DOI),
    ('Microsoft Academic Paper Id', prefixes.MS_ACADEMIC)
]

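# Maps each dataset subdirectory to the path segment used when building fhir_link values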
SUBDIR_MAP: Dict[str, str] = {
    "biorxiv_medrxiv": "bioRxiv-medRxiv",
    "comm_use_subset": "Commercial",
    "pmc_custom_license": "PMC",
    "noncomm_use_subset": "Non-comercial"
}


def generate_identifier(entry: JsonObj) -> None:
    """
    Generate an "id" for entry, preferring its sha and falling back to the first identifier column present
    :param entry: metadata entry
    """
    if hasattr(entry, 'sha'):
        entry.id = entry.sha
    else:
        for identifier, namespace in IDENTIFIERS:
            if hasattr(entry, identifier):
                # Identifier columns may hold several space-separated values -- use the first
                entry.id = namespace[entry[identifier].split()[0]]
                break


def normalize_namespaces(entry: JsonObj) -> None:
    """ Some of the identifier fields actually hold multiple values. Instead of representing these as lists, the
    metadata represents them as space-separated values. Turn them into lists where needed.
    """
    for identifier, _ in IDENTIFIERS:
        ids = getattr(entry, identifier, None)
        if ids and ' ' in ids:
            setattr(entry, identifier, ids.split())


# Generate a list of all the files we know about
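# Data files are named <sha>.json, so their names start with a hex digit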
+ for subdir in data_subdirs :
61
+ for fname in os .listdir (os .path .join (DATASETS_DIR , subdir )):
62
+ if fname [0 ] in string .hexdigits and fname .endswith (".json" ):
63
+ subdir_contents .setdefault (subdir , set ()).add (fname )
64
+ subdir_contents [MISSING_FILE ] = set ()
65
+
66
+
67
def which_subdir(sha: str) -> Optional[str]:
    """ Determine which subset (if any) sha is represented in """
    fname = sha + '.json'
    for k, v in subdir_contents.items():
        if fname in v:
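            # Remove the file from the inventory, so whatever remains at the end is unreferenced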
            subdir_contents[k].remove(fname)
            return k
    subdir_contents[MISSING_FILE].add(fname)
    return MISSING_FILE


+ print ("*** Starting Content ***" )
79
+ for subdir in data_subdirs :
80
+ print (f"\t { subdir } : { len (subdir_contents [subdir ])} " )
81
+ print ()
82
+
83
# Read the metadata file row by row and emit each row as its own JSON file
with open(os.path.join(SOURCE_DIR, 'all_sources_metadata_2020-03-13.csv')) as f:
    reader = DictReader(f)

    known_sources = dict()
    known_subdirs = dict()
    source_x_to_subdir = dict()

    row_num = 0
    for row in reader:
        row_num += 1
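        # Drop empty columns so that hasattr() tests reflect actual content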
        row_j = JsonObj(**{k: v for k, v in row.items() if v != ""})
        generate_identifier(row_j)
        normalize_namespaces(row_j)

        subdir = MISSING_FILE          # Default, so rows without a sha are tallied as missing
        if hasattr(row_j, "sha"):
            subdir = which_subdir(row_j.sha)
            if subdir in SUBDIR_MAP:
                row_j.fhir_link = SUBDIR_MAP[subdir] + '/' + row_j.sha

        if hasattr(row_j, "authors"):
            row_j.authors = [a.strip() for a in row_j.authors.split(';')]
        with open(os.path.join(METADATA_DIR, f'e{row_num}.json'), 'w') as json_file:
            # print(f"***** Writing {row_num}.json")
            json_file.write(as_json(row_j))

        known_sources.setdefault(row_j.source_x, 0)
        known_sources[row_j.source_x] += 1
        known_subdirs.setdefault(subdir, 0)
        known_subdirs[subdir] += 1

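# Summary reports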
+ print (f"{ row_num } entries written" )
115
+ print ("\n *** File by source_x ***" )
116
+ print (known_sources )
117
+ print ("\n *** File by subdirectory ***" )
118
+ print (known_subdirs )
119
+ print ("\n *** Unreferenced files ***" )
120
+ for subdir in data_subdirs :
121
+ rem_ents = len (subdir_contents [subdir ])
122
+ if rem_ents :
123
+ print (f"\t { subdir } : { rem_ents } " )