ccmbioinfo
diff --git a/‎src/setup/__init__.py
100755100644 b/‎src/setup/__init__.py
100755100644
diff --git a/‎src/setup/load.sh
+39 b/‎src/setup/load.sh
+39
diff --git a/‎src/setup/load_geneInfo.py
+85 b/‎src/setup/load_geneInfo.py
+85
diff --git a/‎src/setup/process.sh
+32 b/‎src/setup/process.sh
+32
diff --git a/‎src/setup/process_fasta.py
+33 b/‎src/setup/process_fasta.py
+33
diff --git a/‎src/setup/process_gff3.py
+89 b/‎src/setup/process_gff3.py
+89
diff --git a/‎src/setup/rgens.json
100755100644 b/‎src/setup/rgens.json
100755100644
diff --git a/‎src/setup/setup.sh
+17 b/‎src/setup/setup.sh
+17
diff --git a/‎src/setup/trackList.json
100755100644 b/‎src/setup/trackList.json
100755100644
diff --git a/‎src/setup/trackList_no_regulatory.json
100755100644 b/‎src/setup/trackList_no_regulatory.json
100755100644
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+#
+# Loads a processed genome into a JBrowse data directory and registers it as
+# a dataset.
+#
+# Usage: load.sh <jbrowse bin dir> <source dir> <destination dir> <species> <assembly>
+#
+#   <source dir>       contains *.processed.fa, *.processed.gff3 and, optionally,
+#                      a regulatory build as *.processed.gff
+#   <destination dir>  JBrowse data directory for this assembly
+#
+# Paths such as src/setup/*.json and jbrowse/jbrowse.conf are relative to the
+# current working directory.
+
+set -euo pipefail
+
+if [[ $# -lt 5 ]]; then
+    echo "Usage: $0 <jbrowse bin dir> <source dir> <destination dir> <species> <assembly>" >&2
+    exit 1
+fi
+
+JBROWSE_BIN="$1"
+SOURCE="$2"
+DESTINATION="$3"
+SPECIES="$4"
+ASSEMBLY="$5"
+
+"$JBROWSE_BIN/prepare-refseqs.pl" --fasta "$SOURCE"/*.processed.fa --out "$DESTINATION"
+
+# Gene-level track: subfeatures are dropped.
+"$JBROWSE_BIN/flatfile-to-json.pl" \
+    --gff "$SOURCE"/*.processed.gff3 \
+    --trackLabel Genes \
+    --type gene,ncRNA_gene,pseudogene \
+    --noSubfeatures \
+    --out "$DESTINATION"
+# Transcript-level track.
+"$JBROWSE_BIN/flatfile-to-json.pl" \
+    --gff "$SOURCE"/*.processed.gff3 \
+    --trackLabel Transcripts \
+    --type transcript,pseudogenic_transcript,mRNA,miRNA,ncRNA,scRNA,snoRNA,snRNA,lnc_RNA,rRNA,tRNA \
+    --trackType CanvasFeatures \
+    --out "$DESTINATION"
+
+TARGET=src/setup/trackList_no_regulatory.json
+# Regulatory_Build
+# The pattern is quoted so that compgen itself performs the glob match, and
+# its listing of the matches is discarded: only the exit status is needed.
+if compgen -G "$SOURCE/*.processed.gff" > /dev/null; then
+    TARGET=src/setup/trackList.json
+    "$JBROWSE_BIN/flatfile-to-json.pl" \
+        --gff "$SOURCE"/*.processed.gff \
+        --trackLabel Regulatory_build \
+        --out "$DESTINATION"
+fi
+cp "$TARGET" "$DESTINATION/trackList.json"
+
+# printf keeps backslashes in the species/assembly names literal, unlike echo -e.
+printf '[general]\ndataset_id = %s\n' "$ASSEMBLY" > "$DESTINATION/tracks.conf"
+printf '[datasets.%s]\nurl = ?data=data/%s\nname = %s (%s)\n\n\n' \
+    "$ASSEMBLY" "$ASSEMBLY" "$SPECIES" "$ASSEMBLY" >> jbrowse/jbrowse.conf
+touch "$DESTINATION/gRNA_CRISPR.gff"
+touch "$DESTINATION/acceptedPrimers.gff"
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+
+import json, os, sys
+from urllib.parse import quote_plus
+
+from pymongo import MongoClient
+
+
+# Absolute path of the directory containing this script; used to locate rgens.json.
+dir_path = os.path.dirname(os.path.abspath(__file__))
+
+def load_geneinfo_RGENs(geneInfo_gff, ensembl_version, genome, genome_version,
+    mongo_username=None, mongo_password=None, mongo_database=None):
+    """Load gene annotations and RGEN definitions into MongoDB on localhost.
+
+    Gene records go into the collection "geneInfo_<ensembl_version>" and the
+    organism name into the collection "metadata". The contents of rgens.json
+    (next to this script) are inserted into RGEN.rgenCollection when no
+    database named "RGEN" exists yet.
+
+    Args:
+        geneInfo_gff: path to a GFF3 file with gene features.
+        ensembl_version: Ensembl release, used in the gene collection name.
+        genome: organism name; stored lower-cased in the metadata collection.
+        genome_version: assembly name; used as the database name when
+            mongo_database is None.
+        mongo_username: optional user name for authentication.
+        mongo_password: optional password for authentication.
+        mongo_database: optional database name overriding genome_version.
+
+    Returns:
+        True when the data was inserted or either target collection already
+        exists; otherwise the exception raised while connecting or inserting.
+        Errors are returned rather than raised, so callers must check the
+        result.
+    """
+    gene_info_collection = "geneInfo_" + str(ensembl_version)
+    meta_data_collection = "metadata"
+
+    mongo_uri = "mongodb://localhost:27017"
+    if mongo_username is not None and mongo_password is not None:
+        # Credentials must be percent-escaped to be valid inside a MongoDB URI.
+        mongo_uri = "mongodb://%s:%s@%s" % (quote_plus(mongo_username), quote_plus(mongo_password), "localhost")
+
+    try:
+        pyMongoClient = MongoClient(mongo_uri)
+        # MongoClient connects lazily; ping so that an unreachable server is
+        # reported here instead of at the first real operation.
+        pyMongoClient.admin.command("ping")
+    except Exception as err:
+        return err
+
+    print("Successfully connected to Mongodb")
+    if mongo_database is None: #fix this.
+        mongo_database = genome_version
+
+    # load the RGEN json file into the RGEN database if it doesn't already exist
+    if 'RGEN' not in pyMongoClient.list_database_names():
+        rgenDB = pyMongoClient['RGEN']
+        try:
+            with open(os.path.join(dir_path,'rgens.json')) as json_file:
+                rgenJSON = json.load(json_file)
+            collection = rgenDB['rgenCollection']
+            # Collection.insert() no longer exists in current PyMongo; pick the
+            # replacement that matches the shape of the JSON document.
+            if isinstance(rgenJSON, list):
+                collection.insert_many(rgenJSON)
+            else:
+                collection.insert_one(rgenJSON)
+            print("Successfully inserted RGENs into Mongo database")
+        except Exception as e:
+            # Best effort: a failure here does not stop the gene annotation load.
+            print("Error inserting RGENs into Mongo database: "+ str(e))
+    else:
+        print("RGEN collection already exists in Mongo database, will not overwrite")
+
+    database = pyMongoClient[mongo_database]
+    existing_collections = database.list_collection_names()
+    for collection_name in (gene_info_collection, meta_data_collection):
+        if collection_name in existing_collections:
+            print(collection_name + " already exists in Mongo database")
+            return True
+
+    gene_info_collection_obj = database[gene_info_collection]
+    meta_data_collection_obj = database[meta_data_collection]
+    geneDict = _parse_gene_records(geneInfo_gff)
+    try:
+        gene_info_collection_obj.insert_many(list(geneDict.values()))
+        gene_info_collection_obj.create_index("ENSID")
+        meta_data_collection_obj.insert_one({'org_name': genome.lower()})
+    except Exception as err:
+        return err
+
+    print("Succesfully inserted gene annotations and RGENs into Mongo database.")
+    return True
+
+
+def _parse_gene_records(gff_path):
+    """Return {gene ID: record} for every feature whose type contains "gene".
+
+    Only the first occurrence of each gene ID is kept. Comment lines, lines
+    with fewer than nine columns and features without an ID are skipped.
+    """
+    genes = {}
+    with open(gff_path, "r") as inp_fh:
+        for line in inp_fh:
+            if line.startswith('#'):
+                continue
+            # Drop the line terminator so it cannot end up in the last attribute.
+            columns = line.rstrip("\r\n").split("\t")
+            if len(columns) < 9 or 'gene' not in columns[2].lower():
+                continue
+            attributes = {}
+            for attribute in columns[8].split(";"):
+                # partition() tolerates empty entries and "=" inside a value.
+                key, separator, value = attribute.partition("=")
+                if separator:
+                    attributes[key] = value
+            if 'ID' not in attributes:
+                continue
+            gene_id = attributes['ID'].replace("gene:", "")
+            if gene_id not in genes:
+                genes[gene_id] = {
+                    "ENSID": gene_id,
+                    "Name": attributes.get('Name', gene_id),
+                    "chr": columns[0],
+                    "start": int(columns[3]),
+                    "end": int(columns[4]),
+                    "strand": columns[6],
+                }
+    return genes
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 4:
+        # load_geneinfo_RGENs reports failure by returning the exception
+        # instead of raising it, so the result has to be checked explicitly
+        # for the calling shell script to notice an error.
+        result = load_geneinfo_RGENs(
+            geneInfo_gff=sys.argv[1],
+            ensembl_version=sys.argv[2],
+            genome=sys.argv[3],
+            genome_version=sys.argv[4]
+        )
+        if result is not True:
+            print(f"Failed to load gene annotations: {result}", file=sys.stderr)
+            sys.exit(1)
+    else:
+        print(f"Usage: {sys.argv[0]} <genes.gff3> <Ensembl version> <species> <assembly>", file=sys.stderr)
+        # sys.exit rather than the site-provided exit(), which is not always defined.
+        sys.exit(1)
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+#
+# Decompresses the downloaded genome FASTA and GFF files found in the current
+# directory, normalises them into ../processed and builds the BWA, 2bit,
+# faidx, dicey and BLAST indices for the assembly.
+#
+# Usage: process.sh <assembly>
+
+set -euo pipefail
+
+ASSEMBLY="$1"
+
+# Inner and outer substitutions are both quoted so that an install path
+# containing spaces survives.
+SETUP_BIN=$(dirname "$(realpath "$0")")
+PATH="/var/www/html/bin:/opt/dicey/bin:$PATH"
+
+gunzip --keep *.fa.gz *.gff*.gz
+FASTA=$(echo *.fa) # only one
+# one or two, find the extracted files; kept in an array so that each name
+# stays a single word wherever the list is used
+GFF=()
+for archive in *.gff*.gz; do
+    GFF+=("${archive%.gz}")
+done
+mkdir -p ../processed
+mv "$FASTA" "${GFF[@]}" ../processed
+cd ../processed
+
+python3 "$SETUP_BIN/process_fasta.py" "$FASTA"
+ln -fs *.processed.fa "$ASSEMBLY.fa"
+bwa index "$ASSEMBLY.fa" # creates $ASSEMBLY.bwt
+faToTwoBit "$ASSEMBLY.fa" "$ASSEMBLY.2bit"
+samtools faidx "$ASSEMBLY.fa" # creates $ASSEMBLY.fa.fai
+# Do I need to use a compressed FASTA instead?
+dicey index -o "$ASSEMBLY.fa.fm9" "$ASSEMBLY.fa"
+
+python3 "$SETUP_BIN/process_gff3.py" "${GFF[@]}"
+# The unprocessed extracted copies are no longer needed.
+rm "$FASTA" "${GFF[@]}"
+# *.gff3 is always the non-regulatory build that is always present
+"$SETUP_BIN/create_segments.sh" *.fa.fai *.processed.gff3 "$ASSEMBLY"
+
+mkdir -p ../blastdb
+makeblastdb -in "$ASSEMBLY.fa" -input_type fasta -dbtype nucl \
+    -title "${ASSEMBLY}_blastdb" -parse_seqids -out "../blastdb/${ASSEMBLY}_blastdb"
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+
+import os, sys
+
+
+def process_fasta(input_filename: str) -> None:
+    """
+    Write a copy of a genome FASTA file with normalised sequence headers.
+
+    Each header is cut at the first whitespace and "chr" is prepended to the
+    sequence name unless the name already starts with "chr" (case-insensitive),
+    which is the same rule process_gff3.py applies to the GFF3 sequence column.
+    Sequence lines are copied unchanged.
+
+    The result is written next to the input as "<name>.processed<ext>".
+    """
+    name, ext = os.path.splitext(input_filename)
+    output_filename = name + ".processed" + ext
+    with open(input_filename, "r") as in_file, open(output_filename, "w") as out_file:
+        for line in in_file:
+            if not line.startswith(">"):
+                out_file.write(line)
+                continue
+            # split() without a separator also consumes the line terminator, so
+            # a header that has no description does not get a second newline.
+            seq_id = line.split(None, 1)[0][1:]
+            if not seq_id.lower().startswith("chr"):
+                seq_id = "chr" + seq_id
+            out_file.write(">" + seq_id + "\n")
+
+
+if __name__ == "__main__":
+    # Each command-line argument is a FASTA file to normalise.
+    if len(sys.argv) > 1:
+        for file in sys.argv[1:]:
+            print(f"Processing {file}...", file=sys.stderr)
+            process_fasta(file)
+    else:
+        print(f"Usage: {sys.argv[0]} <file.fa> [...additional.fa]", file=sys.stderr)
+        # sys.exit rather than the site-provided exit(), which is not always defined.
+        sys.exit(1)
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+import os, re, sys
+
+
+def process_gff3(input_filename: str) -> None:
+    """
+    Adds "chr" to the chromosome column and also copies the ensembl phase information from exon lines to CDS lines for JBrowse
+
+    The file is read twice: the first pass collects the "ensembl_end_phase=...;"
+    and "ensembl_phase=...;" attributes of every exon, grouped by transcript;
+    the second pass writes "<name>.processed<ext>" next to the input. Lines
+    that do not have exactly nine tab-separated columns are copied unchanged.
+    A CDS whose exon carries no phase attributes is left as it is.
+    """
+    name, ext = os.path.splitext(input_filename)
+    output_filename = name + ".processed" + ext
+
+    # Each pattern needs a ";" after the value, so an attribute that is the
+    # last one in the column is not picked up.
+    transcript_re = re.compile(r"transcript:(.+?);")
+    end_phase_re = re.compile(r"ensembl_end_phase=(.+?);")
+    phase_re = re.compile(r"ensembl_phase=(.+?);")
+
+    # transcript ID -> {(exon start, exon end): {attribute name: "name=value;"}}
+    all_exons = {}
+
+    # slurp file first
+    with open(input_filename, "r") as in_file:
+        for line in in_file:
+            fields = line.split("\t")
+            if len(fields) != 9 or fields[2] != "exon":
+                continue
+            transcript_match = transcript_re.search(fields[8])
+            if transcript_match is None:
+                continue
+            exons = all_exons.setdefault(transcript_match.group(1), {})
+            exon = exons.setdefault((int(fields[3]), int(fields[4])), {})
+
+            # group(0) keeps the whole "name=value;" text, ready to be prepended.
+            ensembl_end_phase = end_phase_re.search(fields[8])
+            if ensembl_end_phase is not None:
+                exon["ensembl_end_phase"] = ensembl_end_phase.group(0)
+
+            ensembl_phase = phase_re.search(fields[8])
+            if ensembl_phase is not None:
+                exon["ensembl_phase"] = ensembl_phase.group(0)
+
+    # now process the file
+    with open(input_filename, "r") as in_file, open(output_filename, "w") as out_file:
+        for line in in_file:
+            fields = line.split("\t")
+            if len(fields) == 9:
+                # if this line is not a header line
+                if not fields[0].lower().startswith("chr"):
+                    # add chr to the first field if it doesnt start with a chr
+                    fields[0] = "chr" + fields[0]
+                if fields[2] == "CDS":
+                    # if the line is CDS, add the ensembl end phase and start phase to the line by matching it to its exon
+                    transcript_match = transcript_re.search(fields[8])
+                    if transcript_match is not None:
+                        exons = all_exons.get(transcript_match.group(1))
+                        if exons:
+                            cds_start = int(fields[3])
+                            cds_end = int(fields[4])
+                            for (start_pos, end_pos), exon in exons.items():
+                                # if exon start or end position is the same as this CDS or if the exon completely includes the CDS:
+                                if (
+                                    start_pos == cds_start
+                                    or end_pos == cds_end
+                                    or (start_pos < cds_start and end_pos > cds_end)
+                                ):
+                                    # .get(): an exon may lack either attribute
+                                    fields[8] = (
+                                        exon.get("ensembl_end_phase", "")
+                                        + exon.get("ensembl_phase", "")
+                                        + fields[8]
+                                    )
+                                    # stop at the first match so the attributes
+                                    # are never added twice
+                                    break
+            out_file.write("\t".join(fields))
+
+
+if __name__ == "__main__":
+    # Each command-line argument is a GFF3 file to process.
+    if len(sys.argv) > 1:
+        for file in sys.argv[1:]:
+            print(f"Processing {file}...", file=sys.stderr)
+            process_gff3(file)
+    else:
+        print(f"Usage: {sys.argv[0]} <file.gff3> [...additional.gff3]", file=sys.stderr)
+        # sys.exit rather than the site-provided exit(), which is not always defined.
+        sys.exit(1)
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+#
+# Sets up one genome end to end: downloads it, processes it, loads it into
+# JBrowse and loads the gene annotations into MongoDB.
+#
+# Usage: setup.sh <Ensembl version> <species> <assembly>
+#
+# The jbrowse/ paths are relative to the current working directory.
+
+set -euo pipefail
+
+if [[ $# -lt 3 ]]; then
+    echo "Usage: $0 <Ensembl version> <species> <assembly>" >&2
+    exit 1
+fi
+
+ENSEMBL="$1"
+SPECIES="$2"
+ASSEMBLY="$3"
+
+# Inner and outer substitutions are both quoted so that an install path
+# containing spaces survives.
+SETUP_BIN=$(dirname "$(realpath "$0")")
+
+mkdir -p "jbrowse/data/$ASSEMBLY/downloads"
+cd "jbrowse/data/$ASSEMBLY/downloads"
+"$SETUP_BIN/download.sh" "$ENSEMBL" "$SPECIES" "$ASSEMBLY"
+"$SETUP_BIN/process.sh" "$ASSEMBLY"
+# Return to the starting directory; the remaining paths are relative to it.
+cd - >/dev/null
+"$SETUP_BIN/load.sh" jbrowse/bin "jbrowse/data/$ASSEMBLY/processed" "jbrowse/data/$ASSEMBLY" "$SPECIES" "$ASSEMBLY"
+python3 "$SETUP_BIN/load_geneInfo.py" jbrowse/data/"$ASSEMBLY"/processed/*.processed.gff3 "$ENSEMBL" "$SPECIES" "$ASSEMBLY"