問題描述
之前用blast2go太慢,自己寫了個Perl腳本進行遠程blast,可以將fasta文件拆分後並行運行。最後生成XML文件可以直接導入blast2go軟件使用。速度比直接利用blast2go-basic快些。
代碼塊
#!/usr/bin/perl
# huangliangbo 2015-3-30
# 利用該腳本進行遠程blast,生成XML格式(m7)文件;
# 需要安裝XML::SAX模塊
use v5.16;
use warnings;
use Bio::Tools::Run::RemoteBlast;
use Getopt::Long;
use File::Basename;
my $basename=basename($0);
my %opt;
GetOptions(\%opt,"program:s","database:s","file:s","evalue","v:i","b:i","gi","help");
my $help=<<USAGE;
Please enter parameters:
-help print help information
-program string [ blastp, blastn, blastx, tblastn, tblastx ;default: blastp ]
-database string [ swissprot, nr, nt, etc... ]
-file string [ fasta, contains one or more fasta format sequences ]
-evalue string [ default: '1e-5' ]
-v Integer[ Number of database sequences to show one-line descriptions for (V); default:10 ]
-b Integer[ Number of database sequence to show alignments for (B); default:10 ]
-gi [T/F] [ Show GI in deflines,default:T ]
usage:
perl $basename -p program -d database -f file [-e e-value] [-b 5] [-v 5]
USAGE
if($opt{help} or keys %opt < 3){
say $help;
exit;
}
my $prog = $opt{program};
my $db = $opt{database};
my $file = $opt{file};
my $e_val = $opt{evalue}||'1e-5';
my $description = $opt{v}||10;
my $alignment = $opt{b}||10;
my $gi = $opt{gi}||'T';
my @params = ( '-prog' => $prog,
'-data' => $db,
'-expect' => $e_val,
'-readmethod' => 'xml' );
my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
#$factory->retrieve_parameter('FORMAT_TYPE', 'XML');
#change a query paramter
#$Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]';
#$Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = 'PAM30';
#$Bio::Tools::Run::RemoteBlast::HEADER{'GAPCOSTS'} = '9 1';
#$Bio::Tools::Run::RemoteBlast::HEADER{'WORD_SIZE'} = '2';
#Have to request the blast with the right amount of alignments,
#$Bio::Tools::Run::RemoteBlast::HEADER{'ALIGNMENTS'} = '1000';
#$Bio::Tools::Run::RemoteBlast::HEADER{'DESCRIPTIONS'} = '1000';
#change a retrieval parameter
$Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'DESCRIPTIONS'} = $description;
$Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'NCBI_GI'} = $gi=~/f/i?'no':'yes';
$Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'ALIGNMENTS'} = $alignment;
$Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'FORMAT_TYPE'} = 'XML';
#remove a parameter
#delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'};
#$v is just to turn on and off the messages
#$v is just a way to either print out messages or not. So you can toggle whether you want the messages like "waiting ..."
my $v = 1;
my $str = Bio::SeqIO->new(-file=>$file , -format => 'fasta' );
my $num = 1;
while (my $input = $str->next_seq()){
#Blast a sequence against a database:
#Alternatively, you could pass in a file with many
#sequences rather than loop through sequence one at a time
#Remove the loop starting 'while (my $input = $str->next_seq())'
#and swap the two lines below for an example of that.
my $r = $factory->submit_blast($input);
print STDERR "\n$num: waiting...\n" if( $v > 0 );
$num ++;
while ( my @rids = $factory->each_rid ) {
foreach my $rid ( @rids ) {
my $rc = $factory->retrieve_blast($rid);
if( !ref($rc) ) {
if( $rc < 0 ) {
$factory->remove_rid($rid);
}
print STDERR "." if ( $v > 0 );
sleep 5;
} else {
my $result = $rc->next_result();
#save the output
my $filename = $result->query_name()."\.xml";
$factory->save_output($filename);
`cat $filename >> blastRemoteResult.xml; rm $filename`;
$factory->remove_rid($rid);
}
}
}
}