import pandas as pd
import os
from ftplib import FTP
# Local directory that receives the files downloaded from NCBI.
bdir='/cellar/users/btsui/Data/SRA/DUMP/'
# Directory the full-metadata tarball is extracted into.
untaredDir='/nrnb/users/btsui/tmp/SRA_META/'
# NCBI FTP host and the remote directory that is listed and downloaded from.
ftpLink='ftp.ncbi.nlm.nih.gov'
myRemoteDir='sra/reports/Metadata/'

# List the remote directory once. The context manager closes the control
# connection afterwards instead of leaving it open (and idling) for the rest
# of the run; `ftp` stays bound, but to a closed session.
with FTP(ftpLink) as ftp:
    ftp.login()
    ftp.cwd(myRemoteDir)
    fnames=pd.Series(ftp.nlst())

# Pick the last full dump in lexicographic order. This is a plain string
# sort, not a numeric one -- it yields the newest dump as long as the names
# carry a zero-padded date stamp (e.g. ..._Full_20181005.tar.gz).
fullSraMetaCandidates=fnames[fnames.str.contains('NCBI_SRA_Metadata_Full')].sort_values()
if fullSraMetaCandidates.empty:
    # Same exception type as the bare .iloc[-1] would raise, but with a
    # message that says what is actually missing.
    raise IndexError('no NCBI_SRA_Metadata_Full file found in '+myRemoteDir)
myFullSraMeta=fullSraMetaCandidates.iloc[-1]

# Everything that gets downloaded: accession table, full metadata tarball,
# and run-members table.
myDownloadFnames=['SRA_Accessions.tab',myFullSraMeta,
                  'SRA_Run_Members.tab']
#!ls -lah /cellar/users/btsui/Data/SRA/DUMP/
print ('Files to be downloaded from NCBI:',myDownloadFnames)
Files to be downloaded from NCBI: ['SRA_Accessions.tab', 'NCBI_SRA_Metadata_Full_20181005.tar.gz', 'SRA_Run_Members.tab']
# Remove a previously downloaded NCBI_SRA_Metadata tarball from bdir before
# the new one is fetched.
# dtype=object keeps the .str accessor usable even when bdir is empty.
fnames_in_base_dir=pd.Series(os.listdir(bdir),dtype=object)
# Raw string so that `\.` reaches the regex engine as an escaped dot instead
# of being an invalid Python string escape.
existing_sra_tars=fnames_in_base_dir[
    fnames_in_base_dir.str.contains(r'NCBI_SRA_Metadata.*\.tar\.gz$')]
# As before, only the first match is removed; None means there was nothing
# to clean up (e.g. first run), which previously crashed on .iloc[0].
existing_sra_tar=existing_sra_tars.iloc[0] if not existing_sra_tars.empty else None
if existing_sra_tar is not None:
    try:
        # Delete directly rather than through a concatenated shell `rm`.
        os.remove(bdir+existing_sra_tar)
    except OSError as err:
        # Best effort, like the unchecked `rm` it replaces -- but say so.
        print('Could not remove',bdir+existing_sra_tar,':',err)
0
%%time
# Download every selected file from the NCBI FTP directory into bdir.
for f in myDownloadFnames:
    fileDir = bdir+f
    # Open a fresh FTP session per file to avoid the connection idling
    # between transfers. The context manager closes the session even when
    # a transfer fails.
    with FTP(ftpLink) as ftp:
        ftp.login()
        ftp.cwd(myRemoteDir)
        # Open (and truncate) the local file only once the session is up, so
        # a failed connection does not wipe an existing download. `with`
        # guarantees the handle is flushed and closed on error as well.
        with open(fileDir,'wb') as outFile:
            ftp.retrbinary('RETR %s' % f, outFile.write)
CPU times: user 1.93 s, sys: 13.5 s, total: 15.5 s Wall time: 5min 36s
#!ls /nrnb/users/btsui/tmp/SRA_META/
# Make sure the extraction directory exists. Unlike shelling out to `mkdir`,
# this is a no-op when the directory is already present (the shell call
# reported a non-zero status in that situation) and needs no shell string.
os.makedirs(untaredDir, exist_ok=True)
256
%%time
"""
last untaring time 28m57.656s
"""
# Extract the downloaded full-metadata tarball into untaredDir through the
# shell; the leading `time` makes the shell report how long tar took.
tarballPath = bdir+myFullSraMeta
tarCmd = 'time tar --skip-old-files -xvf {inDir} -C {out_dir}'.format(
    inDir=tarballPath,
    out_dir=untaredDir,
)
os.system(tarCmd)
# NOTE(review): execution ends here with status 0 whatever tar returned --
# confirm that ignoring the extraction status is intended.
exit(0)
!ls -lah /nrnb/users/btsui/tmp/
total 15G drwxr-xr-x 6 btsui users 512 Jan 4 2018 . drwxr-xr-x 6 btsui CarterGeneral 512 Aug 10 2016 .. drwxr-xr-x 11 btsui users 128K Oct 2 2015 METAMAP -rw-r--r-- 1 btsui users 15G Jul 4 2016 NCBI_SRA_Metadata_Full_20160702_Run_2.tar drwxr-xr-x 1046455 btsui users 62M Oct 12 16:17 SRA_META drwxr-xr-x 3 btsui users 512 Aug 31 2015 TSCC drwxr-xr-x 2 btsui users 128K Aug 12 2015 bioSample