pwd
u'/Users/sr320/Dropbox/Steven/eimd/ipynbs'
cd
/Users/sr320
ls
Applications/ Geneious 5.5 Data/ anaconda/ BiGo_RNAseq.ipynb HOBOwareInstall.txt apollo-webstart/ BlackAb_Annot.ipynb Library/ asample.html CLC_Data/ Movies/ blast2go/ ClueGOConfiguration/ Music/ igv/ CytoscapeConfiguration/ Pictures/ outputfile Desktop/ Public/ sqlshare-pythonclient/ Documents/ StencylWorks.prefs stencylworks/ Downloads/ TJGR_pearl.ipynb test Dropbox/ VirtualBox VMs/
cd Desktop/
/Users/sr320/Desktop
mkdir myawesome_dir
ls -d my*
myawesome_dir/
!curl -O http://eagle.fish.washington.edu/whale/eimd_14/Phel_transcriptome_clc.fa
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 43.2M 100 43.2M 0 0 756k 0 0:00:58 0:00:58 --:--:-- 417k
!head Phel_transcriptome_clc.fa
>Phel_clc_contig_1 CAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCC GCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAA CCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTT TAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGAT TTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACT ATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGT GTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTT GGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGA CTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCT
!fgrep -c ">" Phel_transcriptome_clc.fa
30578
Convert fasta to tabular
!perl -e '$count=0; $len=0; while(<>) {s/\r?\n//; s/\t/ /g; if (s/^>//) { if ($. != 1) {print "\n"} s/ |$/\t/; $count++; $_ .= "\t";} else {s/ //g; $len += length($_)} print $_;} print "\n"; warn "\nConverted $count FASTA records in $. lines to tabular format\nTotal sequence length: $len\n\n";' Phel_transcriptome_clc.fa > Phel_transcriptome_clc.tab
Converted 30578 FASTA records in 778053 lines to tabular format Total sequence length: 43945151
!head -1 Phel_transcriptome_clc.tab
Phel_clc_contig_1 CAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCCGCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAACCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTTTAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGATTTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACTATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGTGTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTTGGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGACTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCTCTATCGTTCTTGCGTCTATACGTGCTACGAAGAGCTGACAGAAGTTTGGACTTGTTTACTTCTTGCACCTGTTGATGGAACGGCCACGGACCTTGTCGCACGCACACCTGGAGCCAGTGCTCGGATCGACGCAACGGATGTACTGTCTTCCCCTTCCGCGTTTCTCAAGTAGGTACTCAAAGTCGTCCGCGTCGAAGTTGGCCTCGGCGTCCCTCTTCTCCAGCTCCTCCATGTCCTCCTCTGTGTAGTACGGGGTGACGAGCACCACCAGGGCGGCCACAATGGCCAGTGCTAGAAGACACTTCGTATTCATTCTGCTGGTGGTTGGATGTGCGCAAACAAGACAGGAGAGACTTATTAGAATC
!perl -e '$col = 2;' -e 'while (<>) { s/\r?\n//; @F = split /\t/, $_; $len = length($F[$col]); print "$_\t$len\n" } warn "\nAdded column with length of column $col for $. lines.\n\n";' Phel_transcriptome_clc.tab > Phel_transcriptome_clc_length.tab
Added column with length of column 2 for 30578 lines.
!head -1 Phel_transcriptome_clc_length.tab
Phel_clc_contig_1 CAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCCGCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAACCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTTTAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGATTTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACTATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGTGTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTTGGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGACTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCTCTATCGTTCTTGCGTCTATACGTGCTACGAAGAGCTGACAGAAGTTTGGACTTGTTTACTTCTTGCACCTGTTGATGGAACGGCCACGGACCTTGTCGCACGCACACCTGGAGCCAGTGCTCGGATCGACGCAACGGATGTACTGTCTTCCCCTTCCGCGTTTCTCAAGTAGGTACTCAAAGTCGTCCGCGTCGAAGTTGGCCTCGGCGTCCCTCTTCTCCAGCTCCTCCATGTCCTCCTCTGTGTAGTACGGGGTGACGAGCACCACCAGGGCGGCCACAATGGCCAGTGCTAGAAGACACTTCGTATTCATTCTGCTGGTGGTTGGATGTGCGCAAACAAGACAGGAGAGACTTATTAGAATC 905
#lets add header
!echo "contig\tsequence\tseq_len" >> fa.head
!cat fa.head Phel_transcriptome_clc_length.tab > Phel_transcriptome_clc_length2.tab
!head -2 Phel_transcriptome_clc_length2.tab
contig sequence seq_len Phel_clc_contig_1 CAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCCGCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAACCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTTTAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGATTTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACTATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGTGTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTTGGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGACTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCTCTATCGTTCTTGCGTCTATACGTGCTACGAAGAGCTGACAGAAGTTTGGACTTGTTTACTTCTTGCACCTGTTGATGGAACGGCCACGGACCTTGTCGCACGCACACCTGGAGCCAGTGCTCGGATCGACGCAACGGATGTACTGTCTTCCCCTTCCGCGTTTCTCAAGTAGGTACTCAAAGTCGTCCGCGTCGAAGTTGGCCTCGGCGTCCCTCTTCTCCAGCTCCTCCATGTCCTCCTCTGTGTAGTACGGGGTGACGAGCACCACCAGGGCGGCCACAATGGCCAGTGCTAGAAGACACTTCGTATTCATTCTGCTGGTGGTTGGATGTGCGCAAACAAGACAGGAGAGACTTATTAGAATC 905
import pandas as pd
# read data from data file into a pandas DataFrame
Ph = read_table("Phel_transcriptome_clc_length2.tab", # name of the data file
#sep=",", # what character separates each column?
na_values=["", " "]) # what values should be considered "blank" values?
Ph.head()
<class 'pandas.core.frame.DataFrame'> Index: 5 entries, Phel_clc_contig_1 to Phel_clc_contig_5 Data columns (total 3 columns): contig 0 non-null values sequence 5 non-null values seq_len 5 non-null values dtypes: float64(1), int64(1), object(1)
%pylab inline
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['load', 'info', 'save', 'datetime', 'set_printoptions', 'unique'] `%matplotlib` prevents importing * from pylab and numpy
Ph ['seq_len'].hist(bins=1000);
#Axis limits are changed using the axis([xmin, xmax, ymin, ymax]) function.
plt.axis([0, 10000, 0, 5000])
[0, 10000, 0, 5000]
!head -2 Phel_transcriptome_clc_length2.tab
contig sequence seq_len Phel_clc_contig_1 CAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCCGCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAACCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTTTAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGATTTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACTATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGTGTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTTGGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGACTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCTCTATCGTTCTTGCGTCTATACGTGCTACGAAGAGCTGACAGAAGTTTGGACTTGTTTACTTCTTGCACCTGTTGATGGAACGGCCACGGACCTTGTCGCACGCACACCTGGAGCCAGTGCTCGGATCGACGCAACGGATGTACTGTCTTCCCCTTCCGCGTTTCTCAAGTAGGTACTCAAAGTCGTCCGCGTCGAAGTTGGCCTCGGCGTCCCTCTTCTCCAGCTCCTCCATGTCCTCCTCTGTGTAGTACGGGGTGACGAGCACCACCAGGGCGGCCACAATGGCCAGTGCTAGAAGACACTTCGTATTCATTCTGCTGGTGGTTGGATGTGCGCAAACAAGACAGGAGAGACTTATTAGAATC 905
!awk '{print ">"$1,$2}' /Users/sr320/Desktop/Phel_transcriptome_clc_length2.tab | sort -g -r
!fgrep "Phel_clc_contig_2671" Phel_transcriptome_clc_length2.tab | pbcopy
mkdir scripts
cd scripts
/Users/sr320/Desktop/scripts
!curl -O https://raw.githubusercontent.com/sr320/eimd/master/scripts/count_fasta.pl
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 15426 0 15426 0 0 22283 0 --:--:-- --:--:-- --:--:-- 22356
cd ..
/Users/sr320/Desktop
!perl /Users/sr320/Dropbox/Steven/eimd/scripts/count_fasta.pl -i 10000 Phel_transcriptome_clc.fa
0:9999 30486 10000:19999 58 20000:29999 11 30000:39999 9 40000:49999 7 50000:59999 3 60000:69999 2 70000:79999 1 80000:89999 0 90000:99999 0 100000:109999 1 Total length of sequence: 43945151 bp Total number of sequences: 30578 N25 stats: 25% of total sequence length is contained in the 1930 sequences >= 3393 bp N50 stats: 50% of total sequence length is contained in the 6425 sequences >= 1848 bp N75 stats: 75% of total sequence length is contained in the 14670 sequences >= 977 bp Total GC count: 18236282 bp GC %: 41.50 %
!perl /Users/sr320/Dropbox/Steven/eimd/scripts/count_fasta.pl -i 1000 ../data/Phel_transcriptome_clc_v3.fasta
0:999 15505 1000:1999 8559 2000:2999 3003 3000:3999 1240 4000:4999 580 5000:5999 312 6000:6999 129 7000:7999 72 8000:8999 31 9000:9999 19 10000:10999 9 11000:11999 7 12000:12999 2 13000:13999 4 14000:14999 3 15000:15999 0 16000:16999 1 Total length of sequence: 40747496 bp Total number of sequences: 29476 N25 stats: 25% of total sequence length is contained in the 2260 sequences >= 3085 bp N50 stats: 50% of total sequence length is contained in the 6715 sequences >= 1757 bp N75 stats: 75% of total sequence length is contained in the 14612 sequences >= 959 bp Total GC count: 16459121 bp GC %: 40.39 %
Blast - lets get comfortable with commandline; maybe add to our PATH
Download fasta http://www.uniprot.org/downloads