批量從NCBI後臺下載指定數據的Perl腳本

最近需要在NCBI中下載所有Xanthomonas屬菌株對應的gbk文件,由於NCBI前臺gbk數據已經改版,故打算從後臺ftp.ncbi.nlm.nih.gov下載。寫了個Perl腳本用於批量下載NCBI後臺數據,有這方面需求的同仁們可以參考。另外,多進程暫時未成功,後期再更改。

#!/usr/bin/perl
##################################################################################################
# NCBI_ftp_batch_fetch.pl
# 黃良博 [email protected]
# 2015-3-25
# 用途:可用於在NCBI的後臺ftp.ncbi.nlm.nih.gov/genomes/Bacteria中下載某一個屬對應的文件(gbk,fna,gbs,ptt等均可)。
# 下載Xanthomonas屬中所有菌株的gbk文件和fna文件:
# perl $0 -g Xanthomonas -s gbk -s fna
# 注意:多進程暫時不能使用
##################################################################################################

use v5.16;
use warnings;
use Net::FTP;
use Cwd;
use Getopt::Long;
use Parallel::ForkManager;

# Command-line state. $thread is declared for the (currently disabled)
# -thread option, so it is never set by GetOptions below.
my ($genus, @suffix, $thread, $help);

# "=s" makes the option value mandatory. With the optional form (":s") a bare
# "-g" produced an empty genus, and an empty genus matches every directory on
# the server. A false return from GetOptions (unknown option, missing value)
# is treated as fatal instead of being ignored.
GetOptions(
    "genus=s"   => \$genus,
    "suffix=s"  => \@suffix,
    #"thread:i" => \$thread,   # multi-process mode does not work yet
    "help"      => \$help,
) or do {
    say STDERR 'Invalid options, run with -help for usage';
    exit 1;
};

# Leftover arguments are an error; typically a genus containing spaces was
# given without quotes. Report on STDERR and exit with a failure status.
if (@ARGV) {
    say STDERR 'Please check your parameters, maybe you forgot to add "" or _ to genus';
    exit 1;
}

# Help text. The -thread option is deliberately not listed: its GetOptions
# entry is commented out, so passing it is rejected as an unknown option.
my $usage=<<USAGE;
Please enter parameters!
-help       help
-genus      genus
-suffix     file type(eg. gbk,fna,ptt,...)
Usage:
  perl $0 -g genus -s suffix
eg.
  perl $0 -g Xanthomonas -s gbk -s fna
USAGE

# An explicit request for help is not an error: print to STDOUT, exit 0.
if ($help) {
    say $usage;
    exit 0;
}

# A genus and at least one suffix are required. Empty values are rejected as
# well: an empty genus would match every strain directory in the listing and
# an empty suffix would build a "*." glob.
if (!defined $genus or $genus !~ /\S/ or !@suffix or grep { !length } @suffix) {
    say STDERR $usage;
    exit 1;
}

#----------------------------------------
my $host="ftp.ncbi.nlm.nih.gov";
my $directory="/genomes/Bacteria";

my ($ftp,@files);

# Net::FTP->new returns undef on failure and leaves the reason in $@ (not $!).
$ftp=Net::FTP->new($host,Timeout=>240)
    or die "Can't ftp to $host: $@\n";

say "Connected";

# For failures after the connection exists, the server's reply is available
# through $ftp->message; $! is not set by Net::FTP commands.
$ftp->login("ftp","apl\@")
    or die "Can't login to $host: ", $ftp->message;

say "Getting file list...";

$ftp->cwd($directory)
    or die "Can't cd to $directory: ", $ftp->message;

# Create the local download directory (only the last path component is
# created; the parent must already exist) and make it the working directory,
# because downloaded files are written relative to the current directory.
my $restore_dir = "C:/Users/liangbo/Desktop/download"; # local download path
unless (-d $restore_dir){
    mkdir $restore_dir, 0755 or die "Cannot make directory: $!";
}
# Without this check a failed chdir would silently drop the files wherever
# the script happened to be started.
chdir $restore_dir
    or die "Cannot change to directory $restore_dir: $!";

# Long-format listing of the remote directory, one line per entry.
@files=$ftp->dir
    or die "Can't get file list: ", $ftp->message;

say "Got file list";

# Strain directories look like Xanthomonas_oryzae_KACC_10331_uid58155.
# A genus typed with spaces is normalised to that underscore form.
$genus=~s/\s+/_/g;
# Precompiled once with qr. The genus is quoted (\Q..\E) so it is matched
# literally, it must start a whitespace-delimited field of the listing line,
# and \S* stops at the end of the directory name rather than swallowing
# whatever follows it on the line (e.g. a symlink target).
my $regex=qr/(?:\A|\s)(\Q$genus\E\S*)/;

my %count;  # per-suffix statistics: $count{$ext}{found} and $count{$ext}{got}
my $count_strains=0;

# Default to 0 worker processes. Per the Parallel::ForkManager documentation
# a maximum of 0 means nothing is forked, so the loop body runs in this
# process and the counters above stay visible to the summary code.
$thread||=0;
my $pm=Parallel::ForkManager->new($thread);

for my $entry (@files){
    $pm->start and next;
    # Capture the directory name into a lexical instead of relying on $1
    # surviving the nested pattern match further down.
    if(my ($strain_dir)=$entry=~$regex){
        $count_strains++;  # number of strain directories found
        for my $ext (@suffix){
            my $retrive="/genomes/Bacteria/$strain_dir/*.$ext";
            for my $line ($ftp->dir($retrive)){
                # Each matching listing line carries the full remote path.
                # $ext is quoted so it is matched literally.
                next unless $line=~m{(/genomes/Bacteria/(.*\.\Q$ext\E))};
                my ($path,$filename)=($1,$2);
                say $filename; # show the file that was found
                $count{$ext}{found}++;    # number of files found
                # The remote path is /genomes/Bacteria/<strain>/<file>.<ext>.
                # Flatten <strain>/<file>.<ext> into <strain>_<file>.<ext>
                # and use that as the local file name.
                $filename=~s/\//_/g;
                $ftp->get($path,$filename)
                    or die "get failed for $path: ", $ftp->message;
                $count{$ext}{got}++;  # number of files downloaded
            }
        }
    }
    $pm->finish;
}

$pm->wait_all_children;
$ftp->quit;

# Directory the script is currently in; reported as the download location.
my $cwd=getcwd();

# Print the run summary: either a hint that nothing matched, or the number of
# strains plus the per-suffix counters collected while downloading.
if(!$count_strains){
    say "Attention: Nothing found, check your genus input!";
}
else{
    say "Found:";
    say "Strains: $count_strains";
    foreach my $type (keys %count){
        my $tally=$count{$type};
        say "$type files:";
        # Counter names are printed in sorted order.
        say "\t$_\t$tally->{$_}" for sort keys %$tally;
    }
    say "Files downloaded in $cwd";
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章