Perl--尋找兩個基因組中的unique序列

原始數據:

基因組的fasta文件,兩個基因組之間blast的比對座標

scaffold1 2723   9022   contig00005    238996   1

解決方法一:

#!/usr/bin/perl -w 

use strict;
use warnings;
no strict "refs";

my $usage = " perl $0 mutation-vs-wild.m8 mutation.fa wild.fa \n";

unless(@ARGV == 3){
          die $usage;
     }


my $aln  = shift ;
my $mut  = shift;
my $wild = shift;

open ALN,"<",$aln or die "can't open $aln\n";
#open MUT,"<",$mut or die "can't open $mut\n";
#open WILD,"<",$wild or die "can't open $wild\n";

my %mut = build_fa($mut);
my %wild = build_fa($wild);

coden(\%mut);
coden(\%wild);

while(<ALN>){
       chomp;
       my ($scf,$st1,$ed1,$ctg,$st2,$ed2) = split"\t",$_;
       coden_aln($scf,$st1,$ed1);
       coden_aln($ctg,$st2,$ed2);
 }

my %want_mut = get_coord(\%mut);
my %want_wild = get_coord(\%wild);

get_fa_seq ("mut",\%want_mut,\%mut);
get_fa_seq ("wild",\%want_wild,\%wild);


sub build_fa {
         my $file = shift @_;
         my %fa;
         my $tit;

         open FA,"<",$file or die "can't open $file\n";
         while(<FA>){
              chomp;
              if(/>(.*)/){
                 $tit = $1;
               }else{
                 $fa{$tit} .= $_;
      }
    }
     return %fa;
}


sub coden {
         my $hash = shift @_;
         foreach my $key (keys %$hash){
         foreach my $i (1 .. length $hash->{$key}){
                 ${$key}[$i] = 0;
      }
   }
}



sub coden_aln {
    my ($scf,$st,$ed) = @_;
    if($st > $ed){
         ($st,$ed) = ($ed,$st);
   }
   
    foreach my $i ($st .. $ed){
           ${$scf}[$i] =1;
    }
}




sub get_coord {
       my %want;
       my $hash = shift @_;
       my $flag;
            foreach my $key (keys %$hash){
            my $start;
            my $end;
            if($key =~/scaffold/){
                   $flag = "mut";
            }else{ 
                   $flag = "wild";
           }

          foreach my $i (1.. length $hash->{$key}){
                 if($i == 1 && ${$key}[1] == 0){
                     $start = 1;
                }elsif(${$key}[$i] == 0 && ${$key}[$i-1] ==1){
                     $start = $i;
                }
                 if($i == length $hash->{$key} && ${$key}[$i]==0 ){
                     $end = $i;
                 }elsif(${$key}[$i] == 0 && ${$key}[$i+1] == 1){
                     $end = $i;
     }
       if($start && $end && $start <= $end ){
         $want{$key}{$start} = $end;
     }
   }
  }
       open MUT_CORD,">",$flag."_coord";
       foreach my $key (keys %want){
              foreach my $st (keys %{$want{$key}}){
                         my $end = $want{$key}{$st};
                         print MUT_CORD $key,"\t",$st,"\t",$want{$key}{$st},"\n";

      }
   }
      close MUT_CORD;
      return %want;
}


sub get_fa_seq {
           my ($flag,$want,$fa) =  @_;
           open MUT_FA,">",$flag."_uniq.fa";
           foreach my $key (keys %$fa){
                   foreach my $cord (keys %{$want->{$key}}){
                        my $start = $cord;
                        my $end = $want->{$key}{$cord};
                         my $seq =substr ($fa->{$key},$start-1,$end-$start+1);
                         print MUT_FA ">$key  $start->$end\n$seq\n";
       }
    }
      close MUT_FA;
}



發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章