[genefuse] 生成genefuse 的fusion.csv文件

最近分析一批panel測序數據,Panel中包含一些針對Fusion的捕獲探針,使用delly等call SV的軟件效果並不好。所以嘗試使用genefuse這個軟件。
genefuse這個軟件是根據一些先驗的、可能形成Fusion的基因對,直接從fastq文件檢測Fusion,我覺得可能會比沒有先驗知識的caller性能更好一些。但是,我做的是血液腫瘤,而官方提供的先驗Fusion文件中沒有包括血液腫瘤中常見的Fusion基因(如RUNX1-RUNX1T1)。官方給的生成Fusion文件的方法也總是報錯(主要是那個腳本是julia寫的,我不會……)。所以就自己動手豐衣足食了。

根據作者提供的信息以及觀察fusion文件格式,感覺可以從GTF文件中直接抽取相關信息。代碼如下:

 

#!/usr/bin/env python3
 
from collections import OrderedDict
import logging
logging.basicConfig(level=logging.WARNING)
 
 
def main(gtf, genelist, mode):
    """Stream a GTF file gene by gene and hand each gene to process_gene_block.

    Records are grouped into one "gene block" per ``gene`` record: every
    record that follows a ``gene`` line is attached to that block until the
    next ``gene`` line is read. A finished block has this shape::

        {
            "empty": False,
            "gene": <parsed gene record>,
            "uniq_gene_id": "<gene_id>.<gene_version>",
            "gene_name": "<gene_name>",
            "transcript": OrderedDict({
                "<transcript_id>.<transcript_version>": {
                    "transcript_information": <parsed transcript record>,
                    "<uniq_trans_id>.<start>.<end>---<feature>": <record>,
                    ...
                },
            }),
        }

    Args:
        gtf: Path to the GTF annotation file.
        genelist: Path to a text file holding one gene name per line; blank
            lines are ignored.
        mode: Transcript selection mode, forwarded unchanged to
            ``process_gene_block``.

    Raises:
        KeyError: If a ``gene`` record lacks ``gene_id``, ``gene_version`` or
            ``gene_name``, or another record lacks ``transcript_id`` or
            ``transcript_version``.
    """
    # A set gives O(1) membership tests for every gene in the GTF.
    with open(genelist, 'r') as li:
        target_genelist = {name.strip() for name in li if name.strip()}

    # Placeholder until the first gene record is seen.
    gene_block = {
        "empty": True,
        "gene": {},
        "transcript": OrderedDict()
    }

    with open(gtf, 'r') as g:
        for record in g:
            stripped = record.strip()
            # Skip header/comment lines and blank lines (a blank line cannot
            # be split into the nine GTF columns).
            if not stripped or stripped.startswith("#"):
                continue

            a = parse_oneline(record)

            if a['feature'] == "gene":
                # A new gene closes the previous block, if there was one.
                if not gene_block['empty']:
                    process_gene_block(gene_block, target_genelist, mode)
                gene_block = _new_gene_block(a)
                continue

            uniq_trans_id = ".".join(
                [a['attr']['transcript_id'], a['attr']['transcript_version']])
            transcripts = gene_block['transcript']

            if a['feature'] == 'transcript':
                # Register the transcript only the first time it is seen.
                if uniq_trans_id not in transcripts:
                    transcripts[uniq_trans_id] = {'transcript_information': a}
            elif uniq_trans_id in transcripts:
                # Sub-feature (exon, CDS, ...) of an already known transcript;
                # the key encodes its coordinates and feature type.
                general_uniq_id = uniq_trans_id + "." + \
                    a['start'] + "." + a['end'] + \
                    "---" + a['feature']
                transcripts[uniq_trans_id][general_uniq_id] = a

    # Flush the last gene; nothing to do when the file held no gene record.
    if not gene_block['empty']:
        process_gene_block(gene_block, target_genelist, mode)


def _new_gene_block(gene_record):
    """Return a fresh, non-empty gene block built from a parsed gene record."""
    attr = gene_record['attr']
    try:
        return {
            "empty": False,
            "gene": gene_record,
            "uniq_gene_id": ".".join([attr['gene_id'], attr['gene_version']]),
            "gene_name": attr['gene_name'],
            "transcript": OrderedDict(),
        }
    except KeyError as err:
        raise KeyError(
            "gene record is missing required attribute {0}".format(err)
        ) from err
 
 
def process_gene_block(gene_block, target_genelist, mode):
    """Print the selected transcript(s) of one gene block if it is a target.

    Only transcripts whose record carries ``gene_biotype == "protein_coding"``
    are considered.

    Supported modes:
      * ``"longest"``: print the transcript with the largest ``end - start``
        span (the first one wins on ties); log a warning if none qualifies.
      * ``"all"``: print every qualifying transcript in insertion order.

    Any other mode (including a "first transcript" mode, which is not
    implemented) logs a warning and prints nothing.

    Args:
        gene_block: Gene block dict with ``gene_name`` and ``transcript`` keys.
        target_genelist: Container of gene names to report; only membership
            tests are performed on it.
        mode: ``"longest"`` or ``"all"``.
    """
    # Nothing was loaded into the block (e.g. the GTF had no gene record).
    if gene_block.get('empty') or 'gene_name' not in gene_block:
        return

    genename = gene_block['gene_name'].strip()
    if genename not in target_genelist:
        return

    logging.info(
        "[notice] found {0} in GTF file, start to extract information...".format(genename))

    # .get() so that records without a gene_biotype attribute are simply
    # treated as non-coding instead of raising KeyError.
    coding = [
        (trans_id, trans)
        for trans_id, trans in gene_block['transcript'].items()
        if trans['transcript_information']['attr'].get('gene_biotype') == 'protein_coding'
    ]

    if mode == "longest":
        best_id, best_span = None, 0
        for trans_id, trans in coding:
            info = trans['transcript_information']
            span = int(info['end']) - int(info['start'])
            # Strict comparison keeps the earliest transcript on ties.
            if span > best_span:
                best_id, best_span = trans_id, span

        if best_id is None:
            logging.warning(
                "[Waring] do not found protein_coding transcript in {0}".format(genename))
        else:
            print_transcript(gene_block['transcript'][best_id])
    elif mode == "all":
        for _, trans in coding:
            print_transcript(trans)
    else:
        logging.warning(
            "[Waring] mode {0!r} is not supported, nothing printed for {1}".format(
                mode, genename))
         
 
def print_transcript(trans):
    """Print one transcript as a GeneFuse fusion-file record.

    Output format (header, one line per exon, then a blank line)::

        >EML4_ENST00000318522.5,chr2:42396490-42559688
        1,42396490,42396776
        2,42472645,42472827

    Args:
        trans: Transcript dict. ``trans['transcript_information']`` is the
            parsed transcript record; every other key has the form
            ``<transcript_id>.<version>.<start>.<end>---<feature>``.

    Only keys whose feature is ``exon`` are printed, numbered from 1 in
    insertion order.
    """
    info = trans['transcript_information']
    attr = info['attr']

    # Add the "chr" prefix only when the GTF does not already use it.
    chrom = info['chr']
    if not chrom.startswith("chr"):
        chrom = "chr" + chrom

    out = [">" + attr['gene_name'] + "_" + attr['transcript_id'] + "." +
           attr['transcript_version'] + "," + chrom + ":" +
           info['start'] + "-" + info['end']]

    exon_number = 0
    for key in trans:
        if key == 'transcript_information':
            continue
        coords, _, feature = key.partition("---")
        # CDS, codons, UTRs etc. are stored next to the exons but do not
        # belong in the exon list.
        if feature != "exon":
            continue
        # Start and end are the last two dot-separated fields, so this still
        # works if the transcript id itself contains a dot.
        _, start, end = coords.rsplit(".", 2)
        exon_number += 1
        out.append(",".join([str(exon_number), start, end]))

    for line in out:
        print(line)
    print("")
 
 
def parse_oneline(line):
    """Split one tab-separated GTF record into a dict of its nine columns.

    The returned dict has the keys ``chr``, ``source``, ``feature``,
    ``start``, ``end``, ``score``, ``strand``, ``frame`` (all whitespace-
    stripped strings) and ``attr`` (the result of ``parse_attr`` on the
    ninth column).

    Raises:
        ValueError: If the line does not contain exactly nine columns.
    """
    columns = [field.strip() for field in line.strip().split("\t")]
    # Unpacking doubles as the column-count check.
    (chrom, source, feature, start, end,
     score, strand, frame, attributes) = columns
    return {
        'chr': chrom,
        'source': source,
        'feature': feature,
        'start': start,
        'end': end,
        'score': score,
        'strand': strand,
        'frame': frame,
        'attr': parse_attr(attributes),
    }
 
 
def parse_attr(attr):
    """Parse a GTF attribute column into a ``{key: value}`` dict.

    Fields are separated by ``"; "``; within a field the key is everything
    up to the first space and the value is the rest, with the trailing
    ``;`` and surrounding double quotes removed. Values may therefore
    contain spaces. Empty fields are skipped, and a key without a value
    maps to ``""``. If a key occurs more than once, the last value wins.

    Args:
        attr: Raw text of the ninth GTF column.

    Returns:
        Dict mapping attribute names to their string values.
    """
    parsed = {}
    for field in attr.split("; "):
        field = field.strip().rstrip(";").strip()
        if not field:
            continue
        # Split on the first space only so quoted values keep inner spaces.
        key, _, value = field.partition(" ")
        parsed[key] = value.strip().strip('"')
    return parsed
 
 
if __name__ == "__main__":
    # Command-line entry point. Every option defaults to the value that was
    # previously hard-coded, so running the script without arguments behaves
    # exactly as before.
    import argparse

    parser = argparse.ArgumentParser(
        description="Build a GeneFuse fusion file from a GTF annotation.")
    parser.add_argument(
        "--gtf", default="/Disk384G/biodatabase/Homo_sapiens.GRCh37.87.gtf",
        help="path to the GTF annotation file")
    parser.add_argument(
        "--genelist", default='./genelist.txt',
        help="text file with one target gene name per line")
    parser.add_argument(
        "--mode", default="longest", choices=["longest", "all"],
        help="which protein-coding transcript(s) to print per gene")
    args = parser.parse_args()

    # config
    mode = args.mode
    gtf = args.gtf
    genelist = args.genelist
    main(gtf, genelist, mode)

 

GTF格式很不方便,因爲每一行並不是獨立的:一個gene行下包含多個transcript行,每個transcript行下又包含多個exon、CDS,還有其他一堆亂七八糟的行。所以處理GTF文件時,我覺得最好是將文件解析成一個一個的gene_block。這個代碼對GTF文件實現了簡單的解析:按行讀取,根據gene -- transcript -- [exon/CDS....] 這種結構返回一個字典對象,實現了我的目的,其他細節沒太考慮。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章