#test file
!head -2 /Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria/query.fa
!echo
!echo number of seqs =
!fgrep -c ">" /Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria/query.fa
>Mmercenaria_Contig_1 AAGAGTGACTGGTACCACCTGTGTACTACAATGGTTATTTGATACAACTAAATGTAAGCGGTACCACCATGTATTACAATGTGAAATTAGTATCAATAAGTGTGGCTGGTACCTTTATATATTACAGGTGCTGTTATGTTTGACAGGAATACTGATGTGAGATAGTTACTTCCATACTATGTGTAACCTACGGTCCGGCACGTTGAATGGTGGGGTG number of seqs = 8482
cd /Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria/
/Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria
ls
HardClam_contigs.fa blast_sprot.sql.tab blast_sprot.tab blast_sprot_sql.tab me_descriptions.txt me_reprot.txt me_reprotcolumns.txt me_reprotcolumns2.txt me_reprotcolumns3.txt me_reprotcolumns4.csv query.fa
!head -2 me_descriptions.txt
!awk -F"\t" '{print $1, "\t", $3}' \
/Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria/me_descriptions.txt \
> /Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria/merc_slim
!head /Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria/merc_slim
Column1 GOSlim_bin Mmercenaria_Contig_1373 developmental processes Mmercenaria_Contig_1373 cell organization and biogenesis Mmercenaria_Contig_1373 cell organization and biogenesis Mmercenaria_Contig_1373 other biological processes Mmercenaria_Contig_6522 RNA metabolism Mmercenaria_Contig_6522 RNA metabolism Mmercenaria_Contig_6522 RNA metabolism Mmercenaria_Contig_6522 RNA metabolism Mmercenaria_Contig_369 protein metabolism
cd /Volumes/web/cnidarian/
/Volumes/web/cnidarian
mkdir merc_cpg
mkdir: merc_cpg: File exists
cd merc_cpg/
/Volumes/web/cnidarian/merc_cpg
!wget http://eagle.fish.washington.edu/scaphapoda/Grace/Transcriptomes/mercenaria/query.fa
--2014-11-01 14:44:00-- http://eagle.fish.washington.edu/scaphapoda/Grace/Transcriptomes/mercenaria/query.fa Resolving eagle.fish.washington.edu... 128.95.149.81 Connecting to eagle.fish.washington.edu|128.95.149.81|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 2421307 (2.3M) [text/plain] Saving to: `query.fa' 100%[======================================>] 2,421,307 1.46M/s in 1.6s 2014-11-01 14:44:07 (1.46 MB/s) - `query.fa' saved [2421307/2421307]
#overarching concept is place file named query in working directory and all below files are generated using a generic file name
!perl -e '$count=0; $len=0; while(<>) {s/\r?\n//; s/\t/ /g; if (s/^>//) { if ($. != 1) {print "\n"} s/ |$/\t/; $count++; $_ .= "\t";} else {s/ //g; $len += length($_)} print $_;} print "\n"; warn "\nConverted $count FASTA records in $. lines to tabular format\nTotal sequence length: $len\n\n";' \
query.fa > fasta2tab
Converted 8482 FASTA records in 16964 lines to tabular format Total sequence length: 2201882
!head -2 fasta2tab
Mmercenaria_Contig_1 AAGAGTGACTGGTACCACCTGTGTACTACAATGGTTATTTGATACAACTAAATGTAAGCGGTACCACCATGTATTACAATGTGAAATTAGTATCAATAAGTGTGGCTGGTACCTTTATATATTACAGGTGCTGTTATGTTTGACAGGAATACTGATGTGAGATAGTTACTTCCATACTATGTGTAACCTACGGTCCGGCACGTTGAATGGTGGGGTG Mmercenaria_Contig_2 ACAGCTGTCTGATTACTTATACAAAGAACACGGGTTTAAAGCAGAAATGATTGATACTCTGTACAACTATGCCAAGTTTCAGTATGAATGTGGTAATTATTCTGCAGCAGCTGAATATCTCTACTTTGTTAGAATCCTGCTACCACCAAATGACAGAAATTACTTGAATGCATTATGGGGGAAGTTAGCTTCAGAGATTCTCATGCAAACGACCAGTG
#
#
# NOTE THIS REQUIRES File specific knowledge
#
##to make sure we don't get description name issues
!sed 's/Mmercenaria_Contig/999999/g' <fasta2tab> fasta2tab_c
#add column with length of sequence
!perl -e '$col = 2;' -e 'while (<>) { s/\r?\n//; @F = split /\t/, $_; $len = length($F[$col]); print "$_\t$len\n" } warn "\nAdded column with length of column $col for $. lines.\n\n";' \
fasta2tab_c > tab_1
Added column with length of column 2 for 8482 lines.
!wc tab_1
8482 25446 2353451 tab_1
#this counts CGs
!awk -F\CG '{print NF-1}' tab_1 > CG
!awk -F\C '{print NF-1}' tab_1 > C
!awk -F\G '{print NF-1}' tab_1 > G
!paste tab_1 \
CG \
C \
G \
> comb
!head -1 comb
999999_1 AAGAGTGACTGGTACCACCTGTGTACTACAATGGTTATTTGATACAACTAAATGTAAGCGGTACCACCATGTATTACAATGTGAAATTAGTATCAATAAGTGTGGCTGGTACCTTTATATATTACAGGTGCTGTTATGTTTGACAGGAATACTGATGTGAGATAGTTACTTCCATACTATGTGTAACCTACGGTCCGGCACGTTGAATGGTGGGGTG 217 4 34 52
!awk '{print $1, "\t", (($4)/($5*$6))*(($3**2)/($3-1))}' comb > ID_CpG
!head ID_CpG
999999_1 0.493223 999999_2 0.254657 999999_3 0.129997 999999_4 0.95738 999999_5 1.15181 999999_6 0.40192 999999_7 0.145004 999999_8 0.71746 999999_9 0.346284 999999_10 0.442564
!sed 's/999999/Mmercenaria_Contig/g' <ID_CpG> ID_CpG.tab
!head ID_CpG.tab
!echo line number
!wc -l ID_CpG.tab
Mmercenaria_Contig_1 0.493223 Mmercenaria_Contig_2 0.254657 Mmercenaria_Contig_3 0.129997 Mmercenaria_Contig_4 0.95738 Mmercenaria_Contig_5 1.15181 Mmercenaria_Contig_6 0.40192 Mmercenaria_Contig_7 0.145004 Mmercenaria_Contig_8 0.71746 Mmercenaria_Contig_9 0.346284 Mmercenaria_Contig_10 0.442564 line number 8482 ID_CpG.tab
!sort ID_CpG.tab > t1
!head t1
Mmercenaria_Contig_1 0.493223 Mmercenaria_Contig_10 0.442564 Mmercenaria_Contig_100 0.891529 Mmercenaria_Contig_1000 0.724268 Mmercenaria_Contig_1001 0.860937 Mmercenaria_Contig_1002 0.48348 Mmercenaria_Contig_1003 0.543491 Mmercenaria_Contig_1004 0.692323 Mmercenaria_Contig_1005 0.461004 Mmercenaria_Contig_1006 0.871567
!sort /Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria/merc_slim > t2
!head t2
Column1 GOSlim_bin Mmercenaria_Contig_1 DNA metabolism Mmercenaria_Contig_1 DNA metabolism Mmercenaria_Contig_1 other metabolic processes Mmercenaria_Contig_10 cell cycle and proliferation Mmercenaria_Contig_10 cell organization and biogenesis Mmercenaria_Contig_10 other biological processes Mmercenaria_Contig_10 other biological processes Mmercenaria_Contig_100 cell cycle and proliferation Mmercenaria_Contig_100 cell cycle and proliferation
!join -t $'\t' t1 t2 | uniq > t3
!head t3
Mmercenaria_Contig_1 0.493223 DNA metabolism Mmercenaria_Contig_1 0.493223 other metabolic processes Mmercenaria_Contig_10 0.442564 cell cycle and proliferation Mmercenaria_Contig_10 0.442564 cell organization and biogenesis Mmercenaria_Contig_10 0.442564 other biological processes Mmercenaria_Contig_100 0.891529 cell cycle and proliferation Mmercenaria_Contig_100 0.891529 cell organization and biogenesis Mmercenaria_Contig_100 0.891529 other biological processes Mmercenaria_Contig_1000 0.724268 other metabolic processes Mmercenaria_Contig_1000 0.724268 protein metabolism
!wc -l t3
12871 t3
jData = pd.read_table('t3', header=None)
jData
<class 'pandas.core.frame.DataFrame'> Int64Index: 12871 entries, 0 to 12870 Data columns (total 3 columns): 0 12871 non-null values 1 12871 non-null values 2 12871 non-null values dtypes: float64(1), object(2)
jData.groupby(2)[1].mean().plot(kind='barh', color=list('myb'))
plt.axis([0.5, 0.7, 0, 14])
[0.5, 0.7, 0, 14]
# pandas density plot
jData[1].plot(kind='kde', linewidth=3);
plt.axis([-0.3, 1.5, 0, 1.7])
[-0.3, 1.5, 0, 1.7]
!python /Users/sr320/sqlshare-pythonclient/tools/singleupload.py \
-d merc_ID_CpG \
ID_CpG.tab
processing chunk line 0 to 8482 (0.342339992523 s elapsed) pushing ID_CpG.tab... parsing E6A86761... finished merc_ID_CpG
!python /Users/sr320/sqlshare-pythonclient/tools/singleupload.py \
-d merc_slim \
/Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria/merc_slim
processing chunk line 0 to 26861 (0.010213136673 s elapsed) pushing /Volumes/web/scaphapoda/Grace/Transcriptomes/mercenaria/merc_slim... parsing ECA84710... finished merc_slim
!python /Users/sr320/sqlshare-pythonclient/tools/fetchdata.py \
-s "SELECT * FROM [sr320@washington.edu].[merc_slim]sl left join [sr320@washington.edu].[merc_ID_CpG_1.tab]cpg on sl.Column1=cpg.Column1" \
-f tsv \
-o mercCPG_slim.txt
!wc -l mercCPG_slim.txt
26861 mercCPG_slim.txt
!uniq mercCPG_slim.txt | wc -l
19920
#removing duplicate go slims
#
# NOte could have also uniq goslim file before joiing
#
!uniq mercCPG_slim.txt > cpg_slim.txt
!head mercCPG_slim.txt
!head cpg_slim.txt
!wc -l cpg_slim.txt
19920 cpg_slim.txt
import pandas as pd
import numpy as np
pylab inline
Populating the interactive namespace from numpy and matplotlib
#pData = pd.read_csv('https://dl.dropbox.com/u/7710864/data/csv_hid/ss06pid.csv')
pData = pd.read_table('cpg_slim.txt')
pData
<class 'pandas.core.frame.DataFrame'> Int64Index: 19919 entries, 0 to 19918 Data columns (total 4 columns): Column1 19919 non-null values GOSlim_bin 19919 non-null values Column1.1 19919 non-null values Column2 19919 non-null values dtypes: float64(1), object(3)
pData.groupby('GOSlim_bin')['Column2'].mean().plot(kind='barh', color=list('myb'))
plt.axis([0.5, 0.7, 0, 14])
[0.5, 0.7, 0, 14]
pData.groupby('GOSlim_bin')['Column2'].median().plot(kind='barh', color=list('myb'))
plt.axis([0.5, 0.75, 0, 14])
[0.5, 0.75, 0, 14]
pData.groupby('GOSlim_bin')['Column2'].count().plot(kind='barh')
#plt.axis([0.5, 0.75, 0, 14])
<matplotlib.axes.AxesSubplot at 0x104f10610>
pData['Column2'].hist(bins=400);
#Axis limits are changed using the axis([xmin, xmax, ymin, ymax]) function.
plt.axis([0, 1.5, 0, 250])
[0, 1.5, 0, 250]
# pandas density plot
pData['Column2'].plot(kind='kde', linewidth=3);
plt.axis([-0.3, 1.5, 0, 1.7])
[-0.3, 1.5, 0, 1.7]
pData['Column2'].hist(by=pData['GOSlim_bin'])
array([[<matplotlib.axes.AxesSubplot object at 0x111d2d210>, <matplotlib.axes.AxesSubplot object at 0x116b79b90>, <matplotlib.axes.AxesSubplot object at 0x1155aea50>, <matplotlib.axes.AxesSubplot object at 0x11481d1d0>], [<matplotlib.axes.AxesSubplot object at 0x1115c0d10>, <matplotlib.axes.AxesSubplot object at 0x114d0cb50>, <matplotlib.axes.AxesSubplot object at 0x115ceae50>, <matplotlib.axes.AxesSubplot object at 0x116b9b7d0>], [<matplotlib.axes.AxesSubplot object at 0x116aed2d0>, <matplotlib.axes.AxesSubplot object at 0x114e778d0>, <matplotlib.axes.AxesSubplot object at 0x1157de6d0>, <matplotlib.axes.AxesSubplot object at 0x11596f250>], [<matplotlib.axes.AxesSubplot object at 0x115522c50>, <matplotlib.axes.AxesSubplot object at 0x116c82110>, <matplotlib.axes.AxesSubplot object at 0x11569f3d0>, <matplotlib.axes.AxesSubplot object at 0x116db4150>]], dtype=object)