To use Python 3 on the cluster you can use a module that has it as the default. In BASH on the command line you will do
module load miniconda3
conda activate GEN220 # use an existing gen220 environment
Do this one time (make sure you did the step above to activate the environment)
python3 -m ipykernel install --user --name GEN220 --display-name "Python (GEN220)"
This will load an environment where python3 is the python installed.
You can check that this works by doing python -V and see the version is 3.XX
This will allow you to install python packages as well as any other pkgs you might want to.
Make your own.
 mkdir ~/bigdata/.conda
 ln -s ~/bigdata/.conda ~/.conda
 conda create -y -n mygen220 python=3 mamba
 source activate mygen220
 conda config --add channels defaults
 conda config --add channels bioconda
 conda config --add channels conda-forge
 mamba install biopython bcbio-gff
 mamba install ipykernel
 python3 -m ipykernel install --user --name gen220 --display-name "Python (GEN220)"
# Installed kernelspec gen220 in /rhome/USER/.local/share/jupyter/kernels/gen220
Python has many built in packages already installed and a whole host of contributions that span everything from plotting data, interfacing with other datatypes.
Can open a file that is compressed or write to a file already compressed. This can save space for large files or when you get data from a resource without having to decompress it.
import gzip
with gzip.open(file,"rt") as fh:
    for line in fh:
        print("The first line from uncompressed")
	break
# this is a URL at the UniProt database to get a protein sequence based on
# accession number
import urllib.request
url="https://www.uniprot.org/uniprot/P10127.fasta"
seqdata = urllib.request.urlopen(url)
for line in seqdata:
    linestrip = line.decode('UTF-8').strip()
    print(linestrip)
Modules for Sequence data, BLAST parsing, Multiple alignments
Already installed on biocluster
To installed on own computer you control use Python tool ‘pip’
$ conda create -y -n gen220 # only need to do this once
$ source activate gen220
$ conda install biopython # if you forget to do the line above it will fail
import Bio
from Bio.Seq import Seq
my_seq = Seq("ATGAGTACACTAGGGTAA")
print(my_seq)
rc = my_seq.reverse_complement()
pep = my_seq.translate()
print("revcom is", rc)
print(pep)
more /bigdata/gen220/share/data/E3Q6S8.fasta
>tr|E3Q6S8|E3Q6S8_COLGM RNAse P Rpr2/Rpp21/SNM1 subunit domain-containing protein OS=Colletotrichum graminicola (strain M1.001 / M2 / FGSC 10212) GN=GLRG_02386 PE=4 SV=1
MAKPKSESLPNRHAYTRVSYLHQAAAYLATVQSPTSDSTTNSSQPGHAPHAVDHERCLET
NETVARRFVSDIRAVSLKAQIRPSPSLKQMMCKYCDSLLVEGKTCSTTVENASKGGKKPW
ADVMVTKCKTCGNVKRFPVSAPRQKRRPFREQKAVEGQDTTPAVSEMSTGAD
To process this file:
import sys
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
# seqfile
filename = "/bigdata/gen220/shared/data/E3Q6S8.fasta"
for seq_record in SeqIO.parse( filename , "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(seq_record.seq)
    print(len(seq_record))
This will output:
tr|E3Q6S8|E3Q6S8_COLGM
Seq('MAKPKSESLPNRHAYTRVSYLHQAAAYLATVQSPTSDSTTNSSQPGHAPHAVDH...GAD',
SingleLetterAlphabet())
MAKPKSESLPNRHAYTRVSYLHQAAAYLATVQSPTSDSTTNSSQPGHAPHAVDHERCLETNETVARRFVSDIRAVSLKAQIRPSPSLKQMMCKYCDSLLVEGKTCSTTVENASKGGKKPWADVMVTKCKTCGNVKRFPVSAPRQKRRPFREQKAVEGQDTTPAVSEMSTGAD
172
LOCUS       AJ240084                1905 bp    DNA     linear   PRI 03-FEB-2000
DEFINITION  Homo sapiens TRIM gene, promoter.
ACCESSION   AJ240084
VERSION     AJ240084.1  GI:6911579
KEYWORDS    T-cell receptor interacting molecule; TRIM gene.
SOURCE      Homo sapiens (human)
  ORGANISM  Homo sapiens
      Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
      Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
      Catarrhini; Hominidae; Homo.
REFERENCE   1
AUTHORS   Hubener,C., Mincheva,A., Lichter,P., Schraven,B. and Bruyns,E.
TITLE     Genomic organization and chromosomal localization of the human gene
          encoding the T-cell receptor-interacting molecule (TRIM)
JOURNAL   Immunogenetics 51 (2), 154-158 (2000)
  PUBMED   10663578
REFERENCE   2  (bases 1 to 1905)
  AUTHORS   Huebener,C.
  TITLE     Direct Submission
  JOURNAL   Submitted (06-MAY-1999) Huebener C., Immunomodulation Laboratory,
            Institute for Immunology, University of Heidelberg, Im Neuenheimer
            Feld 305, Heidelberg, 69120, GERMANY
FEATURES             Location/Qualifiers
     source          1..1905
                     /organism="Homo sapiens"
                     /mol_type="genomic DNA"
                     /db_xref="taxon:9606"
                     /clone_lib="RPCI1,3-5 Human PAC library"
     gene            1..1902
                     /gene="TRIM"
     regulatory      1..1746
                     /regulatory_class="promoter"
                     /gene="TRIM"
     5'UTR           1747..1902
                     /gene="TRIM"
ORIGIN
   1 ccaaaaattt ccagtcctga aaccctttct ctttccaatg tcctctgtaa gctcgagttg
  61 tgggcatcta ctttgcccat attccaaggt cttgcttagg taacctctgt agtcctttct
 121 tgagcctagg acttctactt ttcttaccag ttaccctctt tcaggaccaa agctcaactc
 181 ctcaaggcca taactaggcc ctctcctctc aaactgattt atcaggtgcc cgaatcttcc
 241 tgaatgtctg ggattcaact tttcagcagt cttcctccct acgttccatc taattctaag
 301 atgaaacctt ctgattcttt gttgtcctct gatccctaca tgaacctgag gctgctgttc
 361 cctgaagtct tgttctgtca gcatccaggc ctgcttcata aaacctgtca ctctgctaat
 421 ggttagcggc tgaacaaaga gtcctctggc caaataagtt tagaaaaact ctgataaaaa
 481 tattatttgg gtttcctttt cgcaggactt acctaagcct ttaatatgca tctacggagg
 541 taaaaataaa gctatatatt ttttccaaag atatttgttg aagaaacatt tgtcttctgc
 601 gtttcttaaa ggccgagtgt tctatggaac atactttaaa aaaccctttt aaagaagctt
 661 agaccagaga atctccaagg tctctttcag ttttacagcc tctgagtcaa cgattcacca
 721 aaaaatattt tggggggaag tgattgaagt ggaaaaattt gttagtgttt agccagcttt
 781 gtccaaagga taagatgcac tgtattttgc ttactaggga gttattttct ataatggaag
 841 acaaagaaag cacaagacac ccatggtttt gtttgttcaa tcactgagag taagtctcaa
 901 ttattgagac ttacgatgtg ccggtgtgct taattctagt tatgaaattt taataatgaa
 961 taatatagat tctattcctt atatgagttt ccaaaagcat tgtccagaac atctatatta
1021 aaatatctta tcatatacaa tatatgtaat ttaaaatgca ctcagaaaat ctgcttgtta
1081 aaatgcagat tctagtgctt caccctaaat agtctaattt agacgggccc aggattttaa
1141 actagcatct tatagcatac ttatgtacac caacatgtaa gaactgctgc tattaagatt
1201 ctgggatggt ggttgagaac aggagcttgt tgtcaggtgg ctctagattg gacagagaaa
1261 ctcatactga taaggtgagg attgtcagga aataaggcag gcatctagcc tcgcattaag
1321 atgaggtata gaaggcaact gatacatact aagtgctcaa aaaatattaa ctccctgtcc
1381 tccatcatgg ctcaagaaaa tacaacagct gagcacaccc acgggttgct tactatttac
1441 ttatcagttt agtgtatctt attttgtttc catgtgaatt tacttgtgaa gagatgactg
1501 gattctctcc agagatagga agatccctcc tggtttaatt cctaccttta tttatttatt
1561 tttcaattag actcaggtat tgataaaaat tcaaatgtca gattacaaag gtgtgtggga
1621 tttttcttcc cacgttacac aatttaagtc gactgttttc agatcaaaac tcaagacaac
1681 tccttcacca catttcctgt ttgtaactga aacaaagtac acacaaaaga ttttaagaaa
1741 cagaagagaa aagaatccga ggcacagata aagataagtt ttactgtcat gctgctttta
1801 acataacaga gcaacatcac ctaggaaaaa agtttgtagg aggattttta atccatatat
1861 ttgtcttatg gctagataaa gatttctccg aaaaaaagaa gcatg
//
To Parse the genbank file - it is the same code! Just change the format.
import sys
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
# seqfile
filename = "/bigdata/gen220/shared/data/AJ240084.gbk"
for seq_record in SeqIO.parse( filename , "genbank"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(seq_record.seq)
    print(len(seq_record))
Produces
python bp_parse_gbk.py ../data/AJ240084_TRIM.gbk
AJ240084.1
Seq('CCAAAAATTTCCAGTCCTGAAACCCTTTCTCTTTCCAATGTCCTCTGTAAGCTC...ATG',
IUPACAmbiguousDNA())
CCAAAAATTTCCAGTCCTGAAACCCTTTCTCTTTCCAATGTCCTCTGTAAGCTCGAGTTGTGGGCATCTACTTTGCCCATATTCCAAGGTCTTGCTTAGGTAACCTCTGTAGTCCTTTCTTGAGCCTAGGACTTCTACTTTTCTTACCAGTTACCCTCTTTCAGGACCAAAGCTCAACTCCTCAAGGCCATAACTAGGCCCTCTCCTCTCAAACTGATTTATCAGGTGCCCGAATCTTCCTGAATGTCTGGGATTCAACTTTTCAGCAGTCTTCCTCCCTACGTTCCATCTAATTCTAAGATGAAACCTTCTGATTCTTTGTTGTCCTCTGATCCCTACATGAACCTGAGGCTGCTGTTCCCTGAAGTCTTGTTCTGTCAGCATCCAGGCCTGCTTCATAAAACCTGTCACTCTGCTAATGGTTAGCGGCTGAACAAAGAGTCCTCTGGCCAAATAAGTTTAGAAAAACTCTGATAAAAATATTATTTGGGTTTCCTTTTCGCAGGACTTACCTAAGCCTTTAATATGCATCTACGGAGGTAAAAATAAAGCTATATATTTTTTCCAAAGATATTTGTTGAAGAAACATTTGTCTTCTGCGTTTCTTAAAGGCCGAGTGTTCTATGGAACATACTTTAAAAAACCCTTTTAAAGAAGCTTAGACCAGAGAATCTCCAAGGTCTCTTTCAGTTTTACAGCCTCTGAGTCAACGATTCACCAAAAAATATTTTGGGGGGAAGTGATTGAAGTGGAAAAATTTGTTAGTGTTTAGCCAGCTTTGTCCAAAGGATAAGATGCACTGTATTTTGCTTACTAGGGAGTTATTTTCTATAATGGAAGACAAAGAAAGCACAAGACACCCATGGTTTTGTTTGTTCAATCACTGAGAGTAAGTCTCAATTATTGAGACTTACGATGTGCCGGTGTGCTTAATTCTAGTTATGAAATTTTAATAATGAATAATATAGATTCTATTCCTTATATGAGTTTCCAAAAGCATTGTCCAGAACATCTATATTAAAATATCTTATCATATACAATATATGTAATTTAAAATGCACTCAGAAAATCTGCTTGTTAAAATGCAGATTCTAGTGCTTCACCCTAAATAGTCTAATTTAGACGGGCCCAGGATTTTAAACTAGCATCTTATAGCATACTTATGTACACCAACATGTAAGAACTGCTGCTATTAAGATTCTGGGATGGTGGTTGAGAACAGGAGCTTGTTGTCAGGTGGCTCTAGATTGGACAGAGAAACTCATACTGATAAGGTGAGGATTGTCAGGAAATAAGGCAGGCATCTAGCCTCGCATTAAGATGAGGTATAGAAGGCAACTGATACATACTAAGTGCTCAAAAAATATTAACTCCCTGTCCTCCATCATGGCTCAAGAAAATACAACAGCTGAGCACACCCACGGGTTGCTTACTATTTACTTATCAGTTTAGTGTATCTTATTTTGTTTCCATGTGAATTTACTTGTGAAGAGATGACTGGATTCTCTCCAGAGATAGGAAGATCCCTCCTGGTTTAATTCCTACCTTTATTTATTTATTTTTCAATTAGACTCAGGTATTGATAAAAATTCAAATGTCAGATTACAAAGGTGTGTGGGATTTTTCTTCCCACGTTACACAATTTAAGTCGACTGTTTTCAGATCAAAACTCAAGACAACTCCTTCACCACATTTCCTGTTTGTAACTGAAACAAAGTACACACAAAAGATTTTAAGAAACAGAAGAGAAAAGAATCCGAGGCACAGATAAAGATAAGTTTTACTGTCATGCTGCTTTTAACATAACAGAGCAACATCACCTAGGAAAAAAGTTTGTAGGAGGATTTTTAATCCATATATTTGTCTTATGGCTAGATAAAGATTTCTCCGAAAAAAAGAAGCATG
1905
See documentation on SeqIO here and the tutorial
#!/usr/bin/env python3
import sys
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
filename = sys.argv[1]
for seq_record in SeqIO.parse( filename , "genbank"):
    print(seq_record.id)
    for feature in seq_record.features:
        print("\t",feature.type,feature.location)
        print("\t",feature.type,feature.location.start,
	feature.location.end, feature.location.strand)
Here’s a FASTA data file you have downloaded before - vector sequences in NCBI BLAST database.
$ head /bigdata/gen220/shared/data/vector
>gi|413493|emb|A08585.1| pMG36 expression vector DNA sequence
ATGGCAATCGTTTCAGCAGAAAAATTCGTAATTCGAGCTCGCCCGGGGATCGATCCTCTAGAGTCGACCT
GCAGGCATGCAAGCTT
>gi|513652|emb|A13775.1| oligonucleotide 14
CGAATCGTAACCGTTCGTACGAGAATCGCTGTCCTCTCCTTC
from Bio import SeqIO
infile="/bigdata/gen220/shared/data/vector"
handle = open(infile, "rU")
record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
print(record_dict.keys)
handle.close()
print(record_dict[">gi|513652|emb|A13775.1|"]) #use any record ID
<built-in method keys of dict object at 0x2ba6cfbfb1b0>
ID: gi|1052556|emb|Z50149.1|
Name: gi|1052556|emb|Z50149.1|
Description: gi|1052556|emb|Z50149.1| Artificial DNA for plasmid pMin2
Number of features: 0
Seq('GAATTCTCATGTTTGACAGCTTATCATTTGGCAATAAAACACCAAAATGAATGA...TTT', SingleLetterAlphabet())
From GenBank to Fasta
from Bio import SeqIO
infile="/bigdata/gen220/shared/data/cor6_6.gbk"
input_handle = open(infile, "rt")
output_handle = open("cor6_6.fasta", "w")
sequences = SeqIO.parse(input_handle, "genbank")
count = SeqIO.write(sequences, output_handle, "fasta")
output_handle.close()
input_handle.close()
Or even more simply
from Bio import SeqIO
infile="/bigdata/gen220/shared/data/cor6_6.gbk"
count = SeqIO.convert(infile, "genbank", "cor6_6.fasta", "fasta")
print("Converted %i records" % count)
#!/usr/bin/env python3
from urllib.request import urlopen
from io import TextIOWrapper
from Bio import SeqIO
url="https://www.uniprot.org/uniprot/P10127.fasta"
handle = TextIOWrapper(urlopen(url))
sequences = SeqIO.parse(handle, "fasta")
for seq in sequences:
	print("seq is ",seq.id, " ", seq.seq)
Pairwise Alignment Parsing (BLAST, FASTA, HMMER)
Multiple Alignment Parsing
Database access (local, fast indexed files; Remote databases via Web)
Some Graphics drawing support
GFF parsing - http://biopython.org/wiki/GFF_Parsing
bcbio-gff package in your local conda#!/usr/bin/env python3
import os
gffurl="https://fungidb.org/common/downloads/release-45/ScerevisiaeS288c/gff/data/FungiDB-45_ScerevisiaeS288c.gff"
in_file = os.path.basename(gffurl)
if not os.path.exists(in_file):
    os.system("curl -O "+gffurl)
from BCBio import GFF
filter_options = dict(
        gff_id = ["BK006934"], # this specifies the names of the chromosome/contigs you want to include
        gff_type = ["CDS"], # this limits the type of features to include
        gff_source = ["EuPathDB"])
# this option uses the limit_info value
# filter_options is a dictionary that specifies the subset of features to print out
in_handle = open(in_file)
for rec in GFF.parse(in_handle, limit_info=filter_options):
    for feat in rec.features:
      # this prints out the feature object
      # this prints out a default options which provides some default printing
        print(feat)
        break  # this stops after the first feature, just printing out
in_handle.close()
in_handle = open(in_file)
# this is same as above but doesn't use the filter
for rec in GFF.parse(in_handle):
    for feat in rec.features:
        print(feat)
        break # this stops after the first feature, just printing out
in_handle.close()