"""
Process the all_sources_metadata file, producing both a JSON and an RDF rendering in the metadata output directory
"""
import os
import string
from csv import DictReader
from typing import Optional, Set, Dict, List, Tuple

from jsonasobj import as_json, JsonObj
from rdflib import Namespace

from scripts.metadata import prefixes, DATASETS_DIR, SOURCE_DIR, METADATA_DIR

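# Pseudo-subdirectory used to collect metadata rows whose sha has no matching dataset file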
MISSING_FILE = 'MISSING'

# Root of datasets
data_subdirs = [e for e in os.listdir(DATASETS_DIR) if e != 'metadata' and '.' not in e]
subdir_contents: Dict[str, Set[str]] = dict()

# TODO: Can we find WHO Covidence?
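# Identifier columns in priority order -- when a row has no sha, the first column present supplies its "id"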
IDENTIFIERS: List[Tuple[str, Namespace]] = [
    ('pubmed_id', prefixes.PUBMED),
    ('pmcid', prefixes.PMC),
    ('doi', prefixes.DOI),
    ('Microsoft Academic Paper Id', prefixes.MS_ACADEMIC)
]

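# Maps each dataset subdirectory to the path segment used when building fhir_link values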
SUBDIR_MAP: Dict[str, str] = {
    "biorxiv_medrxiv": "bioRxiv-medRxiv",
    "comm_use_subset": "Commercial",
    "pmc_custom_license": "PMC",
    "noncomm_use_subset": "Non-comercial"
}


def generate_identifier(entry: JsonObj) -> None:
    """
    Generate an "id" for entry, preferring its sha and falling back to the first identifier column present
    :param entry: metadata entry
    """
    if hasattr(entry, 'sha'):
        entry.id = entry.sha
    else:
        for identifier, namespace in IDENTIFIERS:
            if hasattr(entry, identifier):
                # Identifier columns may hold several space-separated values -- use the first
                entry.id = namespace[entry[identifier].split()[0]]
                break


def normalize_namespaces(entry: JsonObj) -> None:
    """ Some of the identifier fields actually hold multiple values. Instead of representing these as lists, the
    metadata represents them as space-separated values. Turn them into lists where needed.
    """
    for identifier, _ in IDENTIFIERS:
        ids = getattr(entry, identifier, None)
        if ids and ' ' in ids:
            setattr(entry, identifier, ids.split())


# Generate a list of all the files we know about
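# Data files are named <sha>.json, so their names start with a hex digit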
+ for subdir in data_subdirs :
61
+ for fname in os .listdir (os .path .join (DATASETS_DIR , subdir )):
62
+ if fname [0 ] in string .hexdigits and fname .endswith (".json" ):
63
+ subdir_contents .setdefault (subdir , set ()).add (fname )
64
+ subdir_contents [MISSING_FILE ] = set ()
65
+
66
+
67
def which_subdir(sha: str) -> Optional[str]:
    """ Determine which subset (if any) sha is represented in """
    fname = sha + '.json'
    for k, v in subdir_contents.items():
        if fname in v:
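            # Remove the file from the inventory, so whatever remains at the end is unreferenced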
            subdir_contents[k].remove(fname)
            return k
    subdir_contents[MISSING_FILE].add(fname)
    return MISSING_FILE


+ print ("*** Starting Content ***" )
79
+ for subdir in data_subdirs :
80
+ print (f"\t { subdir } : { len (subdir_contents [subdir ])} " )
81
+ print ()
82
+
83
# Read the metadata file row by row and emit each row as its own JSON file
with open(os.path.join(SOURCE_DIR, 'all_sources_metadata_2020-03-13.csv')) as f:
    reader = DictReader(f)

    known_sources = dict()
    known_subdirs = dict()
    source_x_to_subdir = dict()

    row_num = 0
    for row in reader:
        row_num += 1
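        # Drop empty columns so that hasattr() tests reflect actual content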
        row_j = JsonObj(**{k: v for k, v in row.items() if v != ""})
        generate_identifier(row_j)
        normalize_namespaces(row_j)

        subdir = MISSING_FILE          # Default, so rows without a sha are tallied as missing
        if hasattr(row_j, "sha"):
            subdir = which_subdir(row_j.sha)
            if subdir in SUBDIR_MAP:
                row_j.fhir_link = SUBDIR_MAP[subdir] + '/' + row_j.sha

        if hasattr(row_j, "authors"):
            row_j.authors = [a.strip() for a in row_j.authors.split(';')]
        with open(os.path.join(METADATA_DIR, f'e{row_num}.json'), 'w') as json_file:
            # print(f"***** Writing {row_num}.json")
            json_file.write(as_json(row_j))

        known_sources.setdefault(row_j.source_x, 0)
        known_sources[row_j.source_x] += 1
        known_subdirs.setdefault(subdir, 0)
        known_subdirs[subdir] += 1

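# Summary reports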
+ print (f"{ row_num } entries written" )
115
+ print ("\n *** File by source_x ***" )
116
+ print (known_sources )
117
+ print ("\n *** File by subdirectory ***" )
118
+ print (known_subdirs )
119
+ print ("\n *** Unreferenced files ***" )
120
+ for subdir in data_subdirs :
121
+ rem_ents = len (subdir_contents [subdir ])
122
+ if rem_ents :
123
+ print (f"\t { subdir } : { rem_ents } " )