Skip to content

Commit 950ee2a

Browse files
committed
New setup script that verifies assembly name, builds Dicey index, removes temporaries
1 parent ed195e9 commit 950ee2a

10 files changed

+295
-0
lines changed

src/setup/__init__.py

100755100644
File mode changed.

src/setup/load.sh

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env bash
2+
3+
set -euo pipefail
4+
5+
JBROWSE_BIN="$1"
6+
SOURCE="$2"
7+
DESTINATION="$3"
8+
SPECIES="$4"
9+
ASSEMBLY="$5"
10+
11+
"$JBROWSE_BIN/prepare-refseqs.pl" --fasta "$SOURCE"/*.processed.fa --out "$DESTINATION"
12+
13+
"$JBROWSE_BIN/flatfile-to-json.pl" \
14+
--gff "$SOURCE"/*.processed.gff3 \
15+
--trackLabel Genes \
16+
--type gene,ncRNA_gene,pseudogene \
17+
--noSubfeatures \
18+
--out "$DESTINATION"
19+
"$JBROWSE_BIN/flatfile-to-json.pl" \
20+
--gff "$SOURCE"/*.processed.gff3 \
21+
--trackLabel Transcripts \
22+
--type transcript,pseudogenic_transcript,mRNA,miRNA,ncRNA,scRNA,snoRNA,snRNA,lnc_RNA,rRNA,tRNA \
23+
--trackType CanvasFeatures \
24+
--out "$DESTINATION"
25+
TARGET=src/setup/trackList_no_regulatory.json
26+
# Regulatory_Build
27+
if compgen -G "$SOURCE"/*.processed.gff; then
28+
TARGET=src/setup/trackList.json
29+
"$JBROWSE_BIN/flatfile-to-json.pl" \
30+
--gff "$SOURCE"/*.processed.gff \
31+
--trackLabel Regulatory_build \
32+
--out "$DESTINATION"
33+
fi
34+
cp "$TARGET" "$DESTINATION/trackList.json"
35+
36+
echo -e "[general]\ndataset_id = $ASSEMBLY" > "$DESTINATION/tracks.conf"
37+
echo -e "[datasets.$ASSEMBLY]\nurl = ?data=data/$ASSEMBLY\nname = $SPECIES ($ASSEMBLY)\n\n" >> jbrowse/jbrowse.conf
38+
touch "$DESTINATION/gRNA_CRISPR.gff"
39+
touch "$DESTINATION/acceptedPrimers.gff"

src/setup/load_geneInfo.py

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/usr/bin/env python3
2+
3+
import json, os, sys
4+
from urllib.parse import quote_plus
5+
6+
from pymongo import MongoClient
7+
8+
9+
dir_path = os.path.dirname(os.path.abspath(__file__))
10+
11+
def load_geneinfo_RGENs(geneInfo_gff, ensembl_version, genome, genome_version,
12+
mongo_username=None, mongo_password=None, mongo_database=None):
13+
14+
''' This function loads gene annoations into Mongo database under collection "geneInfo_<ensembl_version>" '''
15+
gene_info_collection = "geneInfo_" + str(ensembl_version)
16+
meta_data_collection = "metadata"
17+
18+
mongo_uri = "mongodb://localhost:27017"
19+
if mongo_username is not None and mongo_password is not None:
20+
mongo_uri = "mongodb://%s:%s@%s" % (quote_plus(mongo_username), quote_plus(mongo_password), "localhost") #straight up copied from api.mongodb.com
21+
22+
try:
23+
pyMongoClient = MongoClient(mongo_uri)
24+
except Exception as err:
25+
return(err)
26+
27+
print("Successfully connected to Mongodb")
28+
if mongo_database is None: #fix this.
29+
mongo_database = genome_version
30+
31+
# load the RGEN json file into the RGEN database if it doesn't already exist
32+
if 'RGEN' not in pyMongoClient.list_database_names():
33+
rgenDB = pyMongoClient['RGEN']
34+
try:
35+
with open(os.path.join(dir_path,'rgens.json')) as json_file:
36+
rgenJSON = json.load(json_file)
37+
collection = rgenDB['rgenCollection']
38+
collection.insert(rgenJSON)
39+
print("Successfully inserted RGENs into Mongo database")
40+
except Exception as e:
41+
print("Error inserting RGENs into Mongo database: "+ str(e))
42+
else:
43+
print("RGEN collection already exists in Mongo database, will not overwrite")
44+
45+
for collection_name in (gene_info_collection, meta_data_collection):
46+
if collection_name in pyMongoClient[mongo_database].collection_names():
47+
print(collection_name + " already exists in Mongo database")
48+
return True
49+
50+
gene_info_collection_obj = pyMongoClient[mongo_database][gene_info_collection]
51+
meta_data_collection_obj = pyMongoClient[mongo_database][meta_data_collection]
52+
geneDict = {}
53+
with open(geneInfo_gff,"r") as inp_fh:
54+
for line in inp_fh:
55+
if line.startswith('#') is False:
56+
tmpArr = line.split("\t")
57+
if 'gene' in tmpArr[2].lower():
58+
tmpDict = dict([[val for val in column.split("=")] for column in tmpArr[8].split(";")])
59+
tmpDict['ID'] = tmpDict['ID'].replace("gene:","")
60+
if 'Name' not in tmpDict:
61+
tmpDict['Name'] = tmpDict['ID']
62+
if tmpDict['ID'] not in geneDict:
63+
geneDict[tmpDict['ID']] = {"ENSID":tmpDict['ID'],"Name": tmpDict['Name'], "chr":tmpArr[0], "start":int(tmpArr[3]), "end": int(tmpArr[4]), "strand": tmpArr[6]}
64+
try:
65+
gene_info_collection_obj.insert_many(list(geneDict.values()))
66+
gene_info_collection_obj.create_index("ENSID")
67+
meta_data_collection_obj.insert_one({'org_name': genome.lower()})
68+
except Exception as err:
69+
return(err)
70+
71+
print("Succesfully inserted gene annotations and RGENs into Mongo database.")
72+
return True
73+
74+
75+
if __name__ == "__main__":
76+
if len(sys.argv) > 4:
77+
load_geneinfo_RGENs(
78+
geneInfo_gff=sys.argv[1],
79+
ensembl_version=sys.argv[2],
80+
genome=sys.argv[3],
81+
genome_version=sys.argv[4]
82+
)
83+
else:
84+
print(f"Usage: {sys.argv[0]} <genes.gff3> <Ensembl version> <species> <assembly>", file=sys.stderr)
85+
exit(1)

src/setup/process.sh

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env bash
2+
3+
set -euo pipefail
4+
5+
ASSEMBLY="$1"
6+
7+
SETUP_BIN=$(dirname $(realpath "$0"))
8+
PATH="/var/www/html/bin:/opt/dicey/bin:$PATH"
9+
10+
gunzip --keep *.fa.gz *.gff*.gz
11+
FASTA=$(echo *.fa) # only one
12+
GFF=$(basename --suffix=.gz *.gff*.gz) # one or two, find the extracted files
13+
mkdir -p ../processed
14+
mv "$FASTA" $GFF ../processed
15+
cd ../processed
16+
17+
python3 "$SETUP_BIN/process_fasta.py" "$FASTA"
18+
ln -fs *.processed.fa "$ASSEMBLY.fa"
19+
bwa index "$ASSEMBLY.fa" # creates $ASSEMBLY.bwt
20+
faToTwoBit "$ASSEMBLY.fa" $ASSEMBLY.2bit
21+
samtools faidx "$ASSEMBLY.fa" # creates $ASSEMBLY.fa.fai
22+
# Do I need to use a compressed FASTA instead?
23+
dicey index -o $ASSEMBLY.fa.fm9 "$ASSEMBLY.fa"
24+
25+
python3 "$SETUP_BIN/process_gff3.py" $GFF
26+
rm "$FASTA" $GFF
27+
# *.gff3 is always the non-regulatory build that is always present
28+
"$SETUP_BIN/create_segments.sh" *.fa.fai *.processed.gff3 $ASSEMBLY
29+
30+
mkdir -p ../blastdb
31+
makeblastdb -in $ASSEMBLY.fa -input_type fasta -dbtype nucl \
32+
-title $ASSEMBLY\_blastdb -parse_seqids -out ../blastdb/$ASSEMBLY\_blastdb

src/setup/process_fasta.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env python3
2+
3+
import os, sys
4+
5+
6+
def process_fasta(input_filename: str) -> None:
7+
"""
8+
Adds "chr" to the genome fasta file and removes text after the first space in the header
9+
"""
10+
name, ext = os.path.splitext(input_filename)
11+
output_filename = name + ".processed" + ext
12+
with open(input_filename, "r") as in_file, open(output_filename, "w") as out_file:
13+
for line in in_file:
14+
if line.startswith(">"):
15+
# disregard any text in header after a white space.
16+
tmpList = line.split(" ")
17+
if "chr" in tmpList[0].lower():
18+
out_file.write(tmpList[0] + "\n")
19+
else:
20+
# ">" + chr + chromosome number
21+
out_file.write(tmpList[0][0] + "chr" + tmpList[0][1:] + "\n")
22+
else:
23+
out_file.write(line)
24+
25+
26+
if __name__ == "__main__":
27+
if len(sys.argv) > 1:
28+
for file in sys.argv[1:]:
29+
print(f"Processing {file}...", file=sys.stderr)
30+
process_fasta(file)
31+
else:
32+
print(f"Usage: {sys.argv[0]} <file.fa> [...additional.fa]", file=sys.stderr)
33+
exit(1)

src/setup/process_gff3.py

+89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#!/usr/bin/env python3
2+
3+
import os, re, sys
4+
5+
6+
def process_gff3(input_filename: str) -> None:
7+
"""
8+
Adds "chr" to the chromosome column and also copies the ensembl phase information from exon lines to CDS lines for JBrowse
9+
"""
10+
name, ext = os.path.splitext(input_filename)
11+
output_filename = name + ".processed" + ext
12+
all_exons = {}
13+
14+
# slurp file first
15+
with open(input_filename, "r") as in_file:
16+
for line in in_file:
17+
fields = line.split("\t")
18+
if len(fields) == 9 and fields[2] == "exon":
19+
transcript_match = re.search("transcript:(.+?);", fields[8])
20+
if transcript_match is not None and len(transcript_match.groups()) == 1:
21+
transcript = transcript_match.group(1)
22+
if transcript not in all_exons:
23+
all_exons[transcript] = {}
24+
if fields[3] not in all_exons[transcript]:
25+
all_exons[transcript][fields[3]] = {}
26+
if fields[4] not in all_exons[transcript][fields[3]]:
27+
all_exons[transcript][fields[3]][fields[4]] = {}
28+
29+
exon = all_exons[transcript][fields[3]][fields[4]]
30+
ensembl_end_phase = re.search(
31+
"ensembl_end_phase=(.+?);", fields[8]
32+
)
33+
if (
34+
ensembl_end_phase is not None
35+
and len(ensembl_end_phase.groups()) == 1
36+
):
37+
exon["ensembl_end_phase"] = ensembl_end_phase.group(0)
38+
39+
ensembl_phase = re.search("ensembl_phase=(.+?);", fields[8])
40+
if ensembl_phase is not None and len(ensembl_phase.groups()) == 1:
41+
exon["ensembl_phase"] = ensembl_phase.group(0)
42+
43+
# now process the file
44+
with open(input_filename, "r") as in_file, open(output_filename, "w") as out_file:
45+
for line in in_file:
46+
fields = line.split("\t")
47+
if len(fields) == 9:
48+
# if this line is not a header line
49+
if not fields[0].lower().startswith("chr"):
50+
# add chr to the first field if it doesnt start with a chr
51+
fields[0] = "chr" + fields[0]
52+
if fields[2] == "CDS":
53+
# if the line is CDS, add the ensembl end phase and start phase to the line by matching it to the exon_dict dictionary
54+
transcript_match = re.search("transcript:(.+?);", fields[8])
55+
if (
56+
transcript_match is not None
57+
and len(transcript_match.groups()) == 1
58+
):
59+
transcript = transcript_match.group(1)
60+
if transcript in all_exons:
61+
# loop over the exon entries in the exon_dict for this transcript:
62+
for start_pos in all_exons[transcript].keys():
63+
for end_pos in all_exons[transcript][start_pos].keys():
64+
# if exon start or end position is the same as this CDS or if the exon completely includes the CDS:
65+
if (
66+
int(start_pos) == int(fields[3])
67+
or int(end_pos) == int(fields[4])
68+
or (
69+
int(start_pos) < int(fields[3])
70+
and int(end_pos) > int(fields[4])
71+
)
72+
):
73+
exon = all_exons[transcript][start_pos][end_pos]
74+
fields[8] = (
75+
exon["ensembl_end_phase"]
76+
+ exon["ensembl_phase"]
77+
+ fields[8]
78+
)
79+
out_file.write("\t".join(fields))
80+
81+
82+
if __name__ == "__main__":
83+
if len(sys.argv) > 1:
84+
for file in sys.argv[1:]:
85+
print(f"Processing {file}...", file=sys.stderr)
86+
process_gff3(file)
87+
else:
88+
print(f"Usage: {sys.argv[0]} <file.gff3> [...additional.gff3]", file=sys.stderr)
89+
exit(1)

src/setup/rgens.json

100755100644
File mode changed.

src/setup/setup.sh

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env bash
2+
3+
set -euo pipefail
4+
5+
ENSEMBL="$1"
6+
SPECIES="$2"
7+
ASSEMBLY="$3"
8+
9+
SETUP_BIN=$(dirname $(realpath "$0"))
10+
11+
mkdir -p "jbrowse/data/$ASSEMBLY/downloads"
12+
cd "jbrowse/data/$ASSEMBLY/downloads"
13+
"$SETUP_BIN/download.sh" "$ENSEMBL" "$SPECIES" "$ASSEMBLY"
14+
"$SETUP_BIN/process.sh" "$ASSEMBLY"
15+
cd - >/dev/null
16+
"$SETUP_BIN/load.sh" jbrowse/bin "jbrowse/data/$ASSEMBLY/processed" "jbrowse/data/$ASSEMBLY" "$SPECIES" "$ASSEMBLY"
17+
python3 "$SETUP_BIN/load_geneInfo.py" jbrowse/data/"$ASSEMBLY"/processed/*.processed.gff3 "$ENSEMBL" "$SPECIES" "$ASSEMBLY"

src/setup/trackList.json

100755100644
File mode changed.

src/setup/trackList_no_regulatory.json

100755100644
File mode changed.

0 commit comments

Comments
 (0)