# 最近需要在NCBI中下載所有Xanthomonas屬菌株對應的gbk文件,由於NCBI前臺gbk數據已經改版,故打算從後臺ftp.ncbi.nlm.nih.gov下載。寫了個Perl腳本用於批量下載NCBI後臺數據,有這方面需求的同仁們可以參考。另外,多進程暫時未成功,後期再更改。
#!/usr/bin/perl
##################################################################################################
# NCBI_ftp_batch_fetch.pl
# 黃良博 [email protected]
# 2015-3-25
# 用途:可用於在NCBI的後臺ftp.ncbi.nlm.nih.gov/genomes/Bacteria中下載某一個屬對應的文件(gbk,fna,gbs,ptt等均可)。
# 下載Xanthomonas屬中所有菌株的gbk文件和fna文件:
# perl $0 -g Xanthomonas -s gbk -s fna
# 注意:多進程暫時不能使用
##################################################################################################
use v5.16;
use warnings;
use Net::FTP;
use Cwd;
use Getopt::Long;
use Parallel::ForkManager;
# Command-line state used by the rest of the script.
#   $genus  - genus name to search for (e.g. Xanthomonas)
#   @suffix - one or more file extensions to download (gbk, fna, ptt, ...)
#   $thread - process count; its option is commented out below, so it stays undef here
#   $help   - true when -help was given
my ($genus, @suffix, $thread, $help);

# GetOptions returns false when it meets an unknown or malformed option, so the
# result is kept and checked below instead of being discarded.
my $options_ok = GetOptions(
    "genus:s"  => \$genus,
    "suffix:s" => \@suffix,
    #"thread:i" => \$thread,
    "help"     => \$help,
);

# Leftover command-line arguments are an error: report it and stop.
if (@ARGV != 0) {
    say 'Please check your parameters, maybe you forgot to add "" or _ to genus';
    exit 1;
}

my $usage = <<USAGE;
Please enter parameters!
-help help
-genus genus
-suffix file type(eg. gbk,fna,ptt,...)
-thread number of thread (not available yet)
Usage:
perl $0 -g genus -s suffix
eg.
perl $0 -g Xanthomonas -s gbk -s fna
USAGE

# An explicit request for help is not an error.
if ($help) {
    say "$usage";
    exit();
}

# The ":s" specs make the option values optional, so "-g" or "-s" given without
# a value arrive here as empty strings. An empty genus would later match every
# directory on the server, so empty values are rejected together with missing ones.
my $genus_missing  = !defined $genus || $genus !~ /\S/;
my $suffix_missing = !@suffix || grep { !length } @suffix;
if (!$options_ok or $genus_missing or $suffix_missing) {
    say "$usage";
    exit 1;
}
#----------------------------------------
# Connect to the FTP server, log in anonymously, move to the genomes directory,
# prepare the local download directory and fetch the top-level listing.
my $host      = "ftp.ncbi.nlm.nih.gov";
my $directory = "/genomes/Bacteria";
my ($newerr, $ftp, @files);    # $newerr is not used in this part of the script

# When the constructor fails, Net::FTP puts the reason in $@; $! is not
# meaningful here.
$ftp = Net::FTP->new($host, Timeout => 240)
    or die "Can't ftp to $host: $@\n";
say "Connected";

# Once connected, the server's reply to a failed command is available through
# $ftp->message.
$ftp->login("ftp", "apl\@")
    or die "Can't login to $host: ", $ftp->message;
say "Getting file list...";
$ftp->cwd($directory)
    or die "Can't cd to $directory: ", $ftp->message;

# Create the download directory.
my $restore_dir = "C:/Users/liangbo/Desktop/download";    # download path
unless (-d $restore_dir) {
    mkdir $restore_dir, 0755
        or die "Cannot make directory $restore_dir: $!";
}

# Downloads are written relative to the current directory, so a failed chdir
# must stop the script rather than let files land somewhere unexpected.
chdir $restore_dir
    or die "Cannot change to directory $restore_dir: $!";

# Listing of $directory, one line per entry; an empty listing is fatal.
@files = $ftp->dir
    or die "Can't get file list: ", $ftp->message;
say "Got file list";
# Strain directories are named like Xanthomonas_oryzae_KACC_10331_uid58155,
# so whitespace in the requested genus is turned into underscores.
$genus =~ s/\s+/_/g;

# Compiled once with qr//. \Q...\E keeps the user's input literal, and the
# (?:\A|\s) prefix accepts the genus only at the start of a name, so an entry
# such as "Candidatus_<genus>_..." is not picked up with a truncated path.
my $regex = qr/(?:\A|\s)(\Q$genus\E\S*)/;

my %count;                # statistics: $count{$ext}{found} and $count{$ext}{got}
my $count_strains = 0;    # number of listing entries that matched the genus
$thread ||= 0;            # default: single process
my $pm = Parallel::ForkManager->new($thread);

# NOTE(review): with a non-zero process count the code between start() and
# finish() would presumably run in forked children, so the %count updates made
# there would not reach this process and every child would reuse the same $ftp
# connection. Confirm before enabling the -thread option.
ENTRY:
for my $entry (@files) {
    # Copy the capture into a named variable straight away; the nested match
    # further down must not be able to disturb it.
    my ($strain_dir) = $entry =~ $regex
        or next ENTRY;

    # Counted before start() so the total is kept by this process.
    $count_strains++;

    $pm->start and next ENTRY;

    for my $ext (@suffix) {
        my $retrive = "/genomes/Bacteria/$strain_dir/*.$ext";
        my $ext_re  = quotemeta $ext;

        for my $line ($ftp->dir($retrive)) {
            # $path is the full remote path, $filename the part after
            # /genomes/Bacteria/ (strain directory plus file name).
            my ($path, $filename) = $line =~ m{(/genomes/Bacteria/(.*\.$ext_re))}
                or next;

            say $filename;            # show what was found
            $count{$ext}{found}++;    # files seen on the server

            # Store Xanthomonas_..._uid58155/NC_006834.gbk locally as
            # Xanthomonas_..._uid58155_NC_006834.gbk.
            (my $local_name = $filename) =~ tr{/}{_};

            # One failed transfer should not abort the whole batch: report it
            # and carry on, so that "found" and "got" show what is missing.
            unless ($ftp->get($path, $local_name)) {
                warn "get failed for $path: ", $ftp->message;
                next;
            }
            $count{$ext}{got}++;      # files downloaded successfully
        }
    }

    $pm->finish;
}
$pm->wait_all_children;
# Close the FTP session, then print a summary of what was found and downloaded.
$ftp->quit;
my $cwd = getcwd();

# Collect the report lines first and print them in one go.
my @report;
if ($count_strains) {
    push @report, "Found:", "Strains: $count_strains";
    for my $ext (keys %count) {
        # One heading per extension, followed by its counters in sorted order.
        push @report, "$ext files:",
            map { "\t$_\t$count{$ext}{$_}" } sort keys %{ $count{$ext} };
    }
    push @report, "Files downloaded in $cwd";
}
else {
    push @report, "Attention: Nothing found, check your genus input!";
}
say for @report;